In [1]:
{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "version": "0.0.1",
    "type": "object",
    "description": "Schema for storing and validating LLM evaluation data, including model configuration, prompts, instances, outputs, and evaluation metrics",
    "required": [
        "schema_version",
        "evaluation_id",
        "retrieved_timestamp",
        "source_data_urls",
        "leaderboard_name",
        "source_metadata",
        "model_info",
        "evaluation_results"
    ],
    "properties": {
        "schema_version": {
            "type": "string",
            "description": "Version of the schema used for this evaluation data"
        },
        "evaluation_id": {
            "type": "string",
            "description": "Unique identifier for this specific evaluation run. Use org_name/eval_name/retrieved_timestamp format"
        },
        "retrieved_timestamp": {
            "type": "string",
            "description": "Timestamp for when this record was created"
        },
        "source_data_urls": {
            "type": "array",
            "description": "URLs for the source of the evaluation data",
            "items": {
                "type": "string"
            }
        },
        "leaderboard_name": {
            "type": "string",
            "description": "Title of the source leaderboard for the evaluation."
        },
        "evaluation_platform_name": {
            "type": "string",
            "description": "Title of the platform used for the evaluation."
        },
        "source_metadata": {
            "type": "object",
            "description": "Metadata about the source of the leaderboard data",
            "required": [
                "source_organization_name",
                "evaluator_relationship"
            ],
            "properties": {
                "source_organization_name": {
                    "type": "string",
                    "description": "Name of the organization that provides the data"
                },
                "source_organization_url": {
                    "type": "string",
                    "description": "URL for the organization that provides the data"
                },
                "source_organization_logo_url": {
                    "type": "string",
                    "description": "URL for the logo of the organization that provides the data"
                },
                "evaluator_relationship": {
                    "type": "string",
                    "description": "Relationship between the evaluator and the model",
                    "enum": [
                        "first_party",
                        "third_party",
                        "collaborative",
                        "other"
                    ]
                }
            }
        },
        "model_info": {
            "type": "object",
            "description": "Complete model specification including basic information, technical configuration and inference settings",
            "required": [
                    "name"
            ],
            "properties": {
                "name": {
                    "type": "string",
                    "description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)"
                },
                "developer": {
                    "type": "string",
                    "description": "Name of organization that provides the model (e.g. 'OpenAI')"
                },
                "inference_platform": {
                    "type": "string",
                    "description": "Description of platform used to run the evaluations (e.g. local machine, Bedrock)"
                }
            }
        },
        "evaluation_results": {
            "type": "array",
            "description": "Array of evaluation results",
            "items": {
                "type": "object",
                "required": [
                    "evaluation_name",
                    "metric_config",
                    "score_details"
                ],
                "properties": {
                    "evaluation_name": {
                        "type": "string",
                        "description": "Name of the evaluation"
                    },
                    "evaluation_timestamp": {
                        "type": "string",
                        "description": "Timestamp for when the evaluations were run"
                    },
                    "metric_config": {
                        "type": "object",
                        "description": "Details about the metric",
                        "$comment": "Each 'if' subschema lists score_type under 'required' because a 'properties' check alone passes vacuously when the property is absent; without it, a config that omits the optional score_type would be forced to supply level_names and has_unknown_level.",
                        "required": [
                            "lower_is_better"
                        ],
                        "properties": {
                            "evaluation_description": {
                                "type": "string",
                                "description": "Description of the evaluation"
                            },
                            "lower_is_better": {
                                "type": "boolean",
                                "description": "Whether a lower score is better"
                            },
                            "score_type": {
                                "type": "string",
                                "description": "Type of score",
                                "enum": [
                                    "binary",
                                    "continuous",
                                    "levels"
                                ]
                            },
                            "level_names": {
                                "type": "array",
                                "description": "Names of the score levels",
                                "items": {
                                    "type": "string"
                                }
                            },
                            "level_metadata": {
                                "type": "array",
                                "description": "Additional Description for each Score Level",
                                "items": {
                                    "type": "string"
                                }
                            },
                            "has_unknown_level": {
                                "type": "boolean",
                                "description": "Indicates whether there is an Unknown Level - if True, then a score of -1 will be treated as Unknown"
                            },
                            "min_score": {
                                "type": "number",
                                "description": "Minimum possible score for continuous metric"
                            },
                            "max_score": {
                                "type": "number",
                                "description": "Maximum possible score for continuous metric"
                            }
                        },
                        "if": {
                            "properties": {
                                "score_type": {
                                    "const": "levels"
                                }
                            },
                            "required": [
                                "score_type"
                            ]
                        },
                        "then": {
                            "required": [
                                "level_names",
                                "has_unknown_level"
                            ]
                        },
                        "else": {
                            "if": {
                                "properties": {
                                    "score_type": {
                                        "const": "continuous"
                                    }
                                },
                                "required": [
                                    "score_type"
                                ]
                            },
                            "then": {
                                "required": [
                                    "min_score",
                                    "max_score"
                                ]
                            }
                        }
                    },
                    "score_details": {
                        "type": "object",
                        "description": "The score for the evaluation and related details",
                        "required": [
                            "score"
                        ],
                        "properties": {
                            "score": {
                                "type": "number",
                                "description": "The score for the evaluation"
                            },
                            "details": {
                                "type": "object",
                                "description": "Any additional details about the score",
                                "additionalProperties": true
                            }
                        }
                    },
                    "detailed_evaluation_results_url": {
                        "type": "string",
                        "description": "Link to detailed evaluation data"
                    },
                    "generation_config": {
                        "type": "object",
                        "description": "Settings used to generate the results: generation arguments plus optional free-text details",
                        "properties": {
                            "generation_args": {
                                "type": "object",
                                "description": "Parameters used to generate results - properties may vary by model type",
                                "properties": {
                                    "temperature": {
                                        "type": [
                                            "null",
                                            "number"
                                        ],
                                        "description": "Sampling temperature"
                                    },
                                    "top_p": {
                                        "type": [
                                            "null",
                                            "number"
                                        ],
                                        "description": "Nucleus sampling parameter"
                                    },
                                    "top_k": {
                                        "type": [
                                            "null",
                                            "number"
                                        ],
                                        "description": "Top-k sampling parameter"
                                    },
                                    "max_tokens": {
                                        "type": [
                                            "null",
                                            "integer"
                                        ],
                                        "minimum": 1,
                                        "description": "Maximum number of tokens to generate"
                                    }
                                },
                                "additionalProperties": true
                            },
                            "additional_details": {
                                "type": "string",
                                "description": "Additional details about how the results for this metric were generated."
                            }
                        }
                    }
                }
            }

        }
    }
}


NameError: name 'true' is not defined

In [None]:
[
  {
    "title": "Accuracy",
    "header": [
      {
        "value": "Model",
        "markdown": false,
        "metadata": {}
      },
      {
        "value": "Mean win rate",
        "description": "How many models this model outperforms on average (over columns).",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {}
      },
      {
        "value": "MedCalc-Bench - MedCalc Accuracy",
        "description": "A dataset which consists of a patient note, a question requesting to compute a specific medical value, and a ground truth answer [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).\n\nMedCalc Accuracy: Comparison based on category. Exact match for categories risk, severity and diagnosis. Check if within range for the other categories.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "MedCalc Accuracy",
          "run_group": "MedCalc-Bench"
        }
      },
      {
        "value": "CLEAR - EM",
        "description": "A dataset for evaluating the presence of a specific medical condition from patient notes with yes/no/maybe classifications [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "CLEAR"
        }
      },
      {
        "value": "MTSamples - Jury Score",
        "description": "A dataset of clinical notes where the model is prompted to generate the appropriate treatment plan for this patient [(MTSamples, 2025)](https://mtsamples.com).\n\nMTSamples Replicate Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "MTSamples"
        }
      },
      {
        "value": "Medec - MedecFlagAcc",
        "description": "A dataset containing medical narratives with error detection and correction pairs [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).\n\nMedical Error Flag Accuracy: Measures how accurately the model identifies whether a clinical note contains an error (binary classification of correct/incorrect).",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "MedecFlagAcc",
          "run_group": "Medec"
        }
      },
      {
        "value": "EHRSHOT - EM",
        "description": "A dataset given a patient record of EHR codes, classifying if an event will occur at a future date or not [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "EHRSHOT"
        }
      },
      {
        "value": "HeadQA - EM",
        "description": "A collection of biomedical multiple-choice questions for testing medical knowledge [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "HeadQA"
        }
      },
      {
        "value": "Medbullets - EM",
        "description": "A USMLE-style medical question dataset with multiple-choice answers and explanations [(MedBullets, 2025)](https://step2.medbullets.com).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "Medbullets"
        }
      },
      {
        "value": "MedAlign - Jury Score",
        "description": "A dataset that asks models to answer questions/follow instructions over longitudinal EHR [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).\n\nMedalign Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "MedAlign"
        }
      },
      {
        "value": "ADHD-Behavior - EM",
        "description": "A dataset that classifies whether a clinical note contains a clinician recommendation for parent training in behavior management, which is the first-line evidence-based treatment for young children with ADHD [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "ADHD-Behavior"
        }
      },
      {
        "value": "ADHD-MedEffects - EM",
        "description": "A dataset that classifies whether a clinical note contains documentation of side effect monitoring (recording of absence or presence of medication side effects), as recommended in clinical practice guidelines [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "ADHD-MedEffects"
        }
      },
      {
        "value": "DischargeMe - Jury Score",
        "description": "DischargeMe is a discharge instruction generation dataset and brief hospital course generation dataset collected from MIMIC-IV data, considering only the discharge text as well as the radiology report text [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).\n\nDischargeMe Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "DischargeMe"
        }
      },
      {
        "value": "ACI-Bench - Jury Score",
        "description": "A dataset of patient-doctor conversations paired with structured clinical notes [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).\n\nACI-Bench Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "ACI-Bench"
        }
      },
      {
        "value": "MTSamples Procedures - Jury Score",
        "description": "A dataset that provides a patient note regarding an operation, with the objective to document the procedure.\n\nMTSamples Procedures Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "MTSamples Procedures"
        }
      },
      {
        "value": "MIMIC-RRS - Jury Score",
        "description": "A dataset containing radiology reports with findings sections from MIMIC-III paired with their corresponding impression sections, used for generating radiology report summaries [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).\n\nMIMIC-RRS Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "MIMIC-RRS"
        }
      },
      {
        "value": "MIMIC-BHC - Jury Score",
        "description": "A summarization task using a curated collection of preprocessed discharge notes paired with their corresponding brief hospital course (BHC) summaries [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).\n\nMIMIC-BHC Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "MIMIC-BHC"
        }
      },
      {
        "value": "NoteExtract - Jury Score",
        "description": "A dataset containing free form text of a clinical health worker care plan, with the associated goal being to restructure that text into a given format.\n\nNoteExtract Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "NoteExtract"
        }
      },
      {
        "value": "MedicationQA - Jury Score",
        "description": "Consumer medication questions with reference answers.\n\nMedicationQA Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "MedicationQA"
        }
      },
      {
        "value": "PatientInstruct - Jury Score",
        "description": "A dataset containing case details used to generate customized post-procedure patient instructions.\n\nPatientInstruct Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "PatientInstruct"
        }
      },
      {
        "value": "MedDialog - Jury Score",
        "description": "A collection of doctor-patient conversations with corresponding summaries.\n\nMedDialog Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "MedDialog"
        }
      },
      {
        "value": "MedConfInfo - EM",
        "description": "A dataset of clinical notes from adolescent patients used to identify sensitive protected health information that should be restricted from parental access [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "MedConfInfo"
        }
      },
      {
        "value": "MEDIQA - Jury Score",
        "description": "A dataset including a medical question, a set of candidate answers, relevance annotations for ranking, and additional context to evaluate understanding and retrieval capabilities in a healthcare setting.\n\nMediQA Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "MEDIQA"
        }
      },
      {
        "value": "MentalHealth - Jury Score",
        "description": "A dataset containing a counselor and mental health patient conversation, where the objective is to generate an empathetic counselor response.\n\nMentalHealth Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "MentalHealth"
        }
      },
      {
        "value": "ProxySender - EM",
        "description": "This dataset features messages sent by proxy users and non proxy users, for evaluation of LLM capabilities to determine the sender [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "ProxySender"
        }
      },
      {
        "value": "PrivacyDetection - EM",
        "description": "A dataset that determines if a message leaks any confidential information from the patient [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "PrivacyDetection"
        }
      },
      {
        "value": "PubMedQA - EM",
        "description": "A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "PubMedQA"
        }
      },
      {
        "value": "EHRSQL - EHRSQLExeAcc",
        "description": "Given a natural language instruction, generate an SQL query that would be used in clinical research.\n\nExecution accuracy for Generated Query: Measures the proportion of correctly predicted answerable questions among all questions predicted to be answerable.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EHRSQLExeAcc",
          "run_group": "EHRSQL"
        }
      },
      {
        "value": "BMT-Status - EM",
        "description": "A dataset containing patient notes with associated questions and answers related to bone marrow transplantation.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "BMT-Status"
        }
      },
      {
        "value": "RaceBias - EM",
        "description": "A collection of LLM outputs in response to medical questions with race-based biases, with the objective being to classify whether the output contains racially biased content.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "RaceBias"
        }
      },
      {
        "value": "N2C2-CT - EM",
        "description": "A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "N2C2-CT"
        }
      },
      {
        "value": "MedHallu - EM",
        "description": "A dataset of PubMed articles and associated questions, with the objective being to classify whether the answer is factual or hallucinated.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "MedHallu"
        }
      },
      {
        "value": "HospiceReferral - EM",
        "description": "A dataset evaluating performance in identifying appropriate patient referrals to hospice care.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "HospiceReferral"
        }
      },
      {
        "value": "MIMIC-IV Billing Code - MIMICBillingF1",
        "description": "A  dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes.\n\nF1 Score for MIMIC Billing Codes: Measures the harmonic mean of precision and recall for ICD codes, providing a balanced evaluation of the model's performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "MIMICBillingF1",
          "run_group": "MIMIC-IV Billing Code"
        }
      },
      {
        "value": "ClinicReferral - EM",
        "description": "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "ClinicReferral"
        }
      },
      {
        "value": "CDI-QA - EM",
        "description": "A dataset built from Clinical Document Integrity (CDI) notes, to assess the ability to answer verification questions from previous notes.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "CDI-QA"
        }
      },
      {
        "value": "ENT-Referral - EM",
        "description": "A dataset designed to evaluate performance in identifying appropriate patient referrals to Ear, Nose, and Throat specialists.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "ENT-Referral"
        }
      }
    ],
    "rows": [
      [
        {
          "value": "Claude 3.5 Sonnet (20241022)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6339285714285714,
          "markdown": false
        },
        {
          "value": 0.218,
          "description": "min=0.218, mean=0.218, max=0.218, sum=0.218 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.6633709088997862,
          "description": "min=0.333, mean=0.663, max=0.836, sum=8.624 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.382773874577155,
          "description": "min=4.383, mean=4.383, max=4.383, sum=4.383 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.5209380234505863,
          "description": "min=0.521, mean=0.521, max=0.521, sum=0.521 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.3583333333333334,
          "description": "min=0.136, mean=0.358, max=0.784, sum=1.075 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.906,
          "description": "min=0.906, mean=0.906, max=0.906, sum=0.906 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.6396103896103896,
          "description": "min=0.64, mean=0.64, max=0.64, sum=0.64 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.260999254287848,
          "description": "min=4.261, mean=4.261, max=4.261, sum=4.261 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.7683215130023641,
          "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.9420765027322404,
          "description": "min=0.942, mean=0.942, max=0.942, sum=0.942 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 3.829388888888886,
          "description": "min=3.829, mean=3.829, max=3.829, sum=3.829 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.47037037037037,
          "description": "min=4.47, mean=4.47, max=4.47, sum=4.47 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 3.9348958333333357,
          "description": "min=3.935, mean=3.935, max=3.935, sum=3.935 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.478166666666664,
          "description": "min=4.478, mean=4.478, max=4.478, sum=4.478 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.078888888888891,
          "description": "min=4.079, mean=4.079, max=4.079, sum=4.079 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.948437143509013,
          "description": "min=4.948, mean=4.948, max=4.948, sum=4.948 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.581680374133206,
          "description": "min=4.582, mean=4.582, max=4.582, sum=4.582 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 3.7611572791628247,
          "description": "min=3.761, mean=3.761, max=3.761, sum=3.761 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.109611111111109,
          "description": "min=4.065, mean=4.11, max=4.155, sum=8.219 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.77,
          "description": "min=0.77, mean=0.77, max=0.77, sum=0.77 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.659259259259261,
          "description": "min=4.659, mean=4.659, max=4.659, sum=4.659 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.963515754560531,
          "description": "min=4.964, mean=4.964, max=4.964, sum=4.964 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.7366666666666667,
          "description": "min=0.737, mean=0.737, max=0.737, sum=0.737 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.8266666666666667,
          "description": "min=0.827, mean=0.827, max=0.827, sum=0.827 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.744,
          "description": "min=0.744, mean=0.744, max=0.744, sum=0.744 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.14,
          "description": "min=0.14, mean=0.14, max=0.14, sum=0.14 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.8727272727272727,
          "description": "min=0.873, mean=0.873, max=0.873, sum=0.873 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.7724550898203593,
          "description": "min=0.772, mean=0.772, max=0.772, sum=0.772 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.8333333333333334,
          "description": "min=0.756, mean=0.833, max=0.93, sum=2.5 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.926,
          "description": "min=0.926, mean=0.926, max=0.926, sum=0.926 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.741,
          "description": "min=0.741, mean=0.741, max=0.741, sum=0.741 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.35292484847092087,
          "description": "min=0.353, mean=0.353, max=0.353, sum=0.353 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.908256880733945,
          "description": "min=0.908, mean=0.908, max=0.908, sum=0.908 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.602,
          "description": "min=0.602, mean=0.602, max=0.602, sum=0.602 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.532,
          "description": "min=0.532, mean=0.532, max=0.532, sum=0.532 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        }
      ],
      [
        {
          "value": "Claude 3.7 Sonnet (20250219)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6357142857142857,
          "markdown": false
        },
        {
          "value": 0.21,
          "description": "min=0.21, mean=0.21, max=0.21, sum=0.21 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.6469657842337921,
          "description": "min=0.333, mean=0.647, max=0.836, sum=8.411 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.425969294821761,
          "description": "min=4.426, mean=4.426, max=4.426, sum=4.426 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.628140703517588,
          "description": "min=0.628, mean=0.628, max=0.628, sum=0.628 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.4496666666666666,
          "description": "min=0.285, mean=0.45, max=0.715, sum=1.349 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.912,
          "description": "min=0.912, mean=0.912, max=0.912, sum=0.912 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.6493506493506493,
          "description": "min=0.649, mean=0.649, max=0.649, sum=0.649 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.187173750932139,
          "description": "min=4.187, mean=4.187, max=4.187, sum=4.187 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.8014184397163121,
          "description": "min=0.801, mean=0.801, max=0.801, sum=0.801 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.8163934426229508,
          "description": "min=0.816, mean=0.816, max=0.816, sum=0.816 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 3.803777777777774,
          "description": "min=3.804, mean=3.804, max=3.804, sum=3.804 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.617592592592595,
          "description": "min=4.618, mean=4.618, max=4.618, sum=4.618 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.0234375,
          "description": "min=4.023, mean=4.023, max=4.023, sum=4.023 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.5008888888888885,
          "description": "min=4.501, mean=4.501, max=4.501, sum=4.501 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 3.9911111111111097,
          "description": "min=3.991, mean=3.991, max=3.991, sum=3.991 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.949349760438056,
          "description": "min=4.949, mean=4.949, max=4.949, sum=4.949 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.628124496049018,
          "description": "min=4.628, mean=4.628, max=4.628, sum=4.628 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 3.6611265004616884,
          "description": "min=3.661, mean=3.661, max=3.661, sum=3.661 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.197555555555553,
          "description": "min=4.146, mean=4.198, max=4.249, sum=8.395 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.811,
          "description": "min=0.811, mean=0.811, max=0.811, sum=0.811 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.72814814814815,
          "description": "min=4.728, mean=4.728, max=4.728, sum=4.728 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.955223880597014,
          "description": "min=4.955, mean=4.955, max=4.955, sum=4.955 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.62,
          "description": "min=0.62, mean=0.62, max=0.62, sum=0.62 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.9966666666666667,
          "description": "min=0.997, mean=0.997, max=0.997, sum=0.997 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.625,
          "description": "min=0.625, mean=0.625, max=0.625, sum=0.625 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.081,
          "description": "min=0.081, mean=0.081, max=0.081, sum=0.081 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.9,
          "description": "min=0.9, mean=0.9, max=0.9, sum=0.9 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.6586826347305389,
          "description": "min=0.659, mean=0.659, max=0.659, sum=0.659 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.813953488372093,
          "description": "min=0.756, mean=0.814, max=0.895, sum=2.442 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.877,
          "description": "min=0.877, mean=0.877, max=0.877, sum=0.877 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.767,
          "description": "min=0.767, mean=0.767, max=0.767, sum=0.767 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.3554857455069554,
          "description": "min=0.355, mean=0.355, max=0.355, sum=0.355 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.8868501529051988,
          "description": "min=0.887, mean=0.887, max=0.887, sum=0.887 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.57,
          "description": "min=0.57, mean=0.57, max=0.57, sum=0.57 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.585,
          "description": "min=0.585, mean=0.585, max=0.585, sum=0.585 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        }
      ],
      [
        {
          "value": "DeepSeek R1",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6625,
          "style": {
            "font-weight": "bold"
          },
          "markdown": false
        },
        {
          "value": 0.348,
          "description": "min=0.348, mean=0.348, max=0.348, sum=0.348 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.8295041431536213,
          "description": "min=0.571, mean=0.83, max=0.985, sum=10.784 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.533177205308355,
          "description": "min=4.533, mean=4.533, max=4.533, sum=4.533 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.5912897822445561,
          "description": "min=0.591, mean=0.591, max=0.591, sum=0.591 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.22033333333333335,
          "description": "min=0.019, mean=0.22, max=0.507, sum=0.661 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.721,
          "description": "min=0.721, mean=0.721, max=0.721, sum=0.721 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.6558441558441559,
          "description": "min=0.656, mean=0.656, max=0.656, sum=0.656 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.182326621923938,
          "description": "min=4.182, mean=4.182, max=4.182, sum=4.182 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.8085106382978723,
          "description": "min=0.809, mean=0.809, max=0.809, sum=0.809 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.9377049180327869,
          "description": "min=0.938, mean=0.938, max=0.938, sum=0.938 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.297000000000006,
          "description": "min=4.297, mean=4.297, max=4.297, sum=4.297 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.673148148148149,
          "description": "min=4.673, mean=4.673, max=4.673, sum=4.673 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.230902777777779,
          "description": "min=4.231, mean=4.231, max=4.231, sum=4.231 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.610499999999993,
          "description": "min=4.61, mean=4.61, max=4.61, sum=4.61 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 3.9188888888888855,
          "description": "min=3.919, mean=3.919, max=3.919, sum=3.919 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.665297741273103,
          "description": "min=4.665, mean=4.665, max=4.665, sum=4.665 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.584744396065148,
          "description": "min=4.585, mean=4.585, max=4.585, sum=4.585 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.784856879039718,
          "description": "min=4.785, mean=4.785, max=4.785, sum=4.785 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.435555555555553,
          "description": "min=4.361, mean=4.436, max=4.51, sum=8.871 (2)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.739,
          "description": "min=0.739, mean=0.739, max=0.739, sum=0.739 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.757777777777781,
          "description": "min=4.758, mean=4.758, max=4.758, sum=4.758 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.950248756218905,
          "description": "min=4.95, mean=4.95, max=4.95, sum=4.95 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.7433333333333333,
          "description": "min=0.743, mean=0.743, max=0.743, sum=0.743 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.98,
          "description": "min=0.98, mean=0.98, max=0.98, sum=0.98 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.743,
          "description": "min=0.743, mean=0.743, max=0.743, sum=0.743 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.076,
          "description": "min=0.076, mean=0.076, max=0.076, sum=0.076 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.7772727272727272,
          "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.9161676646706587,
          "description": "min=0.916, mean=0.916, max=0.916, sum=0.916 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.8643410852713179,
          "description": "min=0.779, mean=0.864, max=0.93, sum=2.593 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.847,
          "description": "min=0.847, mean=0.847, max=0.847, sum=0.847 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.701,
          "description": "min=0.701, mean=0.701, max=0.701, sum=0.701 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.2989618858066198,
          "description": "min=0.299, mean=0.299, max=0.299, sum=0.299 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.8960244648318043,
          "description": "min=0.896, mean=0.896, max=0.896, sum=0.896 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.583,
          "description": "min=0.583, mean=0.583, max=0.583, sum=0.583 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.568,
          "description": "min=0.568, mean=0.568, max=0.568, sum=0.568 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        }
      ],
      [
        {
          "value": "Gemini 1.5 Pro (001)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.24107142857142858,
          "markdown": false
        },
        {
          "value": 0.12,
          "description": "min=0.12, mean=0.12, max=0.12, sum=0.12 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.6095005938613985,
          "description": "min=0.222, mean=0.61, max=0.859, sum=7.924 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 4.384595368201935,
          "description": "min=4.385, mean=4.385, max=4.385, sum=4.385 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.52428810720268,
          "description": "min=0.524, mean=0.524, max=0.524, sum=0.524 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.5,
          "description": "min=0.139, mean=0.5, max=0.864, sum=1.5 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.84,
          "description": "min=0.84, mean=0.84, max=0.84, sum=0.84 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.4772727272727273,
          "description": "min=0.477, mean=0.477, max=0.477, sum=0.477 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.884787472035795,
          "description": "min=3.885, mean=3.885, max=3.885, sum=3.885 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.7612293144208038,
          "description": "min=0.761, mean=0.761, max=0.761, sum=0.761 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.7311475409836066,
          "description": "min=0.731, mean=0.731, max=0.731, sum=0.731 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.619333333333337,
          "description": "min=3.619, mean=3.619, max=3.619, sum=3.619 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 4.493518518518516,
          "description": "min=4.494, mean=4.494, max=4.494, sum=4.494 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2.8090277777777786,
          "description": "min=2.809, mean=2.809, max=2.809, sum=2.809 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 4.209722222222227,
          "description": "min=4.21, mean=4.21, max=4.21, sum=4.21 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.6188888888888884,
          "description": "min=3.619, mean=3.619, max=3.619, sum=3.619 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 4.906913073237509,
          "description": "min=4.907, mean=4.907, max=4.907, sum=4.907 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.727624576681176,
          "description": "min=3.728, mean=3.728, max=3.728, sum=3.728 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.5575561711295847,
          "description": "min=3.558, mean=3.558, max=3.558, sum=3.558 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.723499999999989,
          "description": "min=3.633, mean=3.723, max=3.814, sum=7.447 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "med_dialog,subset=icliniq:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.74,
          "description": "min=0.74, mean=0.74, max=0.74, sum=0.74 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 4.225925925925926,
          "description": "min=4.226, mean=4.226, max=4.226, sum=4.226 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 4.378109452736319,
          "description": "min=4.378, mean=4.378, max=4.378, sum=4.378 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.7133333333333334,
          "description": "min=0.713, mean=0.713, max=0.713, sum=0.713 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.9533333333333334,
          "description": "min=0.953, mean=0.953, max=0.953, sum=0.953 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.625,
          "description": "min=0.625, mean=0.625, max=0.625, sum=0.625 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.152,
          "description": "min=0.152, mean=0.152, max=0.152, sum=0.152 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.7818181818181819,
          "description": "min=0.782, mean=0.782, max=0.782, sum=0.782 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.8203592814371258,
          "description": "min=0.82, mean=0.82, max=0.82, sum=0.82 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.7906976744186046,
          "description": "min=0.709, mean=0.791, max=0.86, sum=2.372 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.891,
          "description": "min=0.891, mean=0.891, max=0.891, sum=0.891 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.745,
          "description": "min=0.745, mean=0.745, max=0.745, sum=0.745 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.18605294710805736,
          "description": "min=0.186, mean=0.186, max=0.186, sum=0.186 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.8593272171253823,
          "description": "min=0.859, mean=0.859, max=0.859, sum=0.859 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.466,
          "description": "min=0.466, mean=0.466, max=0.466, sum=0.466 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.576,
          "description": "min=0.576, mean=0.576, max=0.576, sum=0.576 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        }
      ],
      [
        {
          "value": "Gemini 2.0 Flash",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.41964285714285715,
          "markdown": false
        },
        {
          "value": 0.158,
          "description": "min=0.158, mean=0.158, max=0.158, sum=0.158 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.6873491012925947,
          "description": "min=0.352, mean=0.687, max=0.87, sum=8.936 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.116835805360402,
          "description": "min=4.117, mean=4.117, max=4.117, sum=4.117 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.5963149078726968,
          "description": "min=0.596, mean=0.596, max=0.596, sum=0.596 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.5026666666666667,
          "description": "min=0.137, mean=0.503, max=0.816, sum=1.508 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.88,
          "description": "min=0.88, mean=0.88, max=0.88, sum=0.88 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.6298701298701299,
          "description": "min=0.63, mean=0.63, max=0.63, sum=0.63 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.715883668903806,
          "description": "min=3.716, mean=3.716, max=3.716, sum=3.716 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.7281323877068558,
          "description": "min=0.728, mean=0.728, max=0.728, sum=0.728 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.9355191256830601,
          "description": "min=0.936, mean=0.936, max=0.936, sum=0.936 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.4546666666666686,
          "description": "min=3.455, mean=3.455, max=3.455, sum=3.455 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.612037037037038,
          "description": "min=4.612, mean=4.612, max=4.612, sum=4.612 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.641493055555557,
          "description": "min=3.641, mean=3.641, max=3.641, sum=3.641 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.248500000000007,
          "description": "min=4.249, mean=4.249, max=4.249, sum=4.249 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.8299999999999987,
          "description": "min=3.83, mean=3.83, max=3.83, sum=3.83 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.927104722792611,
          "description": "min=4.927, mean=4.927, max=4.927, sum=4.927 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.428801806160294,
          "description": "min=4.429, mean=4.429, max=4.429, sum=4.429 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.4361341951369657,
          "description": "min=3.436, mean=3.436, max=3.436, sum=3.436 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.224527777777782,
          "description": "min=4.178, mean=4.225, max=4.271, sum=8.449 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "med_dialog,subset=icliniq:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.75,
          "description": "min=0.75, mean=0.75, max=0.75, sum=0.75 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.767407407407409,
          "description": "min=4.767, mean=4.767, max=4.767, sum=4.767 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.439469320066335,
          "description": "min=4.439, mean=4.439, max=4.439, sum=4.439 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.7466666666666667,
          "description": "min=0.747, mean=0.747, max=0.747, sum=0.747 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.9066666666666666,
          "description": "min=0.907, mean=0.907, max=0.907, sum=0.907 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.64,
          "description": "min=0.64, mean=0.64, max=0.64, sum=0.64 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.165,
          "description": "min=0.165, mean=0.165, max=0.165, sum=0.165 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.7954545454545454,
          "description": "min=0.795, mean=0.795, max=0.795, sum=0.795 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.8562874251497006,
          "description": "min=0.856, mean=0.856, max=0.856, sum=0.856 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.7170542635658914,
          "description": "min=0.523, mean=0.717, max=0.93, sum=2.151 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.908,
          "description": "min=0.908, mean=0.908, max=0.908, sum=0.908 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.597,
          "description": "min=0.597, mean=0.597, max=0.597, sum=0.597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.23228162127907237,
          "description": "min=0.232, mean=0.232, max=0.232, sum=0.232 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.8990825688073395,
          "description": "min=0.899, mean=0.899, max=0.899, sum=0.899 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.566,
          "description": "min=0.566, mean=0.566, max=0.566, sum=0.566 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.505,
          "description": "min=0.505, mean=0.505, max=0.505, sum=0.505 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        }
      ],
      [
        {
          "value": "GPT-4o (2024-05-13)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5696428571428571,
          "markdown": false
        },
        {
          "value": 0.188,
          "description": "min=0.188, mean=0.188, max=0.188, sum=0.188 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.6421195891654634,
          "description": "min=0.308, mean=0.642, max=0.84, sum=8.348 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.291438979963575,
          "description": "min=4.291, mean=4.291, max=4.291, sum=4.291 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.5795644891122278,
          "description": "min=0.58, mean=0.58, max=0.58, sum=0.58 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.5413333333333333,
          "description": "min=0.404, mean=0.541, max=0.738, sum=1.624 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.906,
          "description": "min=0.906, mean=0.906, max=0.906, sum=0.906 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.711038961038961,
          "description": "min=0.711, mean=0.711, max=0.711, sum=0.711 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 3.844146159582401,
          "description": "min=3.844, mean=3.844, max=3.844, sum=3.844 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.8368794326241135,
          "description": "min=0.837, mean=0.837, max=0.837, sum=0.837 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.9453551912568307,
          "description": "min=0.945, mean=0.945, max=0.945, sum=0.945 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 3.577999999999998,
          "description": "min=3.578, mean=3.578, max=3.578, sum=3.578 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.512037037037037,
          "description": "min=4.512, mean=4.512, max=4.512, sum=4.512 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 3.9088541666666687,
          "description": "min=3.909, mean=3.909, max=3.909, sum=3.909 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.191888888888877,
          "description": "min=4.192, mean=4.192, max=4.192, sum=4.192 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 3.9699999999999993,
          "description": "min=3.97, mean=3.97, max=3.97, sum=3.97 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.954825462012319,
          "description": "min=4.955, mean=4.955, max=4.955, sum=4.955 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.4162231898080915,
          "description": "min=4.416, mean=4.416, max=4.416, sum=4.416 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 3.620190827947067,
          "description": "min=3.62, mean=3.62, max=3.62, sum=3.62 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.211361111111113,
          "description": "min=4.146, mean=4.211, max=4.277, sum=8.423 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.772,
          "description": "min=0.772, mean=0.772, max=0.772, sum=0.772 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.608888888888893,
          "description": "min=4.609, mean=4.609, max=4.609, sum=4.609 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.497512437810945,
          "description": "min=4.498, mean=4.498, max=4.498, sum=4.498 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.72,
          "description": "min=0.72, mean=0.72, max=0.72, sum=0.72 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.9833333333333333,
          "description": "min=0.983, mean=0.983, max=0.983, sum=0.983 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.701,
          "description": "min=0.701, mean=0.701, max=0.701, sum=0.701 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.32,
          "description": "min=0.32, mean=0.32, max=0.32, sum=0.32 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.740909090909091,
          "description": "min=0.741, mean=0.741, max=0.741, sum=0.741 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.8982035928143712,
          "description": "min=0.898, mean=0.898, max=0.898, sum=0.898 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.8294573643410853,
          "description": "min=0.744, mean=0.829, max=0.942, sum=2.488 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.846,
          "description": "min=0.846, mean=0.846, max=0.846, sum=0.846 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.656,
          "description": "min=0.656, mean=0.656, max=0.656, sum=0.656 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.34586736846836813,
          "description": "min=0.346, mean=0.346, max=0.346, sum=0.346 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.8960244648318043,
          "description": "min=0.896, mean=0.896, max=0.896, sum=0.896 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.589,
          "description": "min=0.589, mean=0.589, max=0.589, sum=0.589 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.611,
          "description": "min=0.611, mean=0.611, max=0.611, sum=0.611 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        }
      ],
      [
        {
          "value": "GPT-4o mini (2024-07-18)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.39285714285714285,
          "markdown": false
        },
        {
          "value": 0.154,
          "description": "min=0.154, mean=0.154, max=0.154, sum=0.154 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.663374374081707,
          "description": "min=0.333, mean=0.663, max=0.88, sum=8.624 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.24668227946917,
          "description": "min=4.247, mean=4.247, max=4.247, sum=4.247 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.5326633165829145,
          "description": "min=0.533, mean=0.533, max=0.533, sum=0.533 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.9066666666666666,
          "description": "min=0.858, mean=0.907, max=0.993, sum=2.72 (3)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.832,
          "description": "min=0.832, mean=0.832, max=0.832, sum=0.832 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.564935064935065,
          "description": "min=0.565, mean=0.565, max=0.565, sum=0.565 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 3.6655480984340043,
          "description": "min=3.666, mean=3.666, max=3.666, sum=3.666 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.806146572104019,
          "description": "min=0.806, mean=0.806, max=0.806, sum=0.806 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.7562841530054645,
          "description": "min=0.756, mean=0.756, max=0.756, sum=0.756 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 3.691666666666669,
          "description": "min=3.692, mean=3.692, max=3.692, sum=3.692 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.369444444444443,
          "description": "min=4.369, mean=4.369, max=4.369, sum=4.369 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 3.9157986111111156,
          "description": "min=3.916, mean=3.916, max=3.916, sum=3.916 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.250444444444436,
          "description": "min=4.25, mean=4.25, max=4.25, sum=4.25 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 3.7944444444444447,
          "description": "min=3.794, mean=3.794, max=3.794, sum=3.794 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.942048825005705,
          "description": "min=4.942, mean=4.942, max=4.942, sum=4.942 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.329624254152552,
          "description": "min=4.33, mean=4.33, max=4.33, sum=4.33 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 3.6140350877193055,
          "description": "min=3.614, mean=3.614, max=3.614, sum=3.614 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.145166666666659,
          "description": "min=4.127, mean=4.145, max=4.163, sum=8.29 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.738,
          "description": "min=0.738, mean=0.738, max=0.738, sum=0.738 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.56814814814815,
          "description": "min=4.568, mean=4.568, max=4.568, sum=4.568 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.540630182421227,
          "description": "min=4.541, mean=4.541, max=4.541, sum=4.541 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.7333333333333333,
          "description": "min=0.733, mean=0.733, max=0.733, sum=0.733 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.96,
          "description": "min=0.96, mean=0.96, max=0.96, sum=0.96 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.672,
          "description": "min=0.672, mean=0.672, max=0.672, sum=0.672 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.112,
          "description": "min=0.112, mean=0.112, max=0.112, sum=0.112 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.8363636363636363,
          "description": "min=0.836, mean=0.836, max=0.836, sum=0.836 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.7664670658682635,
          "description": "min=0.766, mean=0.766, max=0.766, sum=0.766 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.7829457364341085,
          "description": "min=0.733, mean=0.783, max=0.872, sum=2.349 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.801,
          "description": "min=0.801, mean=0.801, max=0.801, sum=0.801 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.708,
          "description": "min=0.708, mean=0.708, max=0.708, sum=0.708 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.263243296579599,
          "description": "min=0.263, mean=0.263, max=0.263, sum=0.263 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.8990825688073395,
          "description": "min=0.899, mean=0.899, max=0.899, sum=0.899 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.579,
          "description": "min=0.579, mean=0.579, max=0.579, sum=0.579 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.594,
          "description": "min=0.594, mean=0.594, max=0.594, sum=0.594 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.3 Instruct (70B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.30357142857142855,
          "markdown": false
        },
        {
          "value": 0.113,
          "description": "min=0.113, mean=0.113, max=0.113, sum=0.113 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.8130430111845777,
          "description": "min=0.609, mean=0.813, max=0.926, sum=10.57 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=chronic_pain,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=homelessness,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=liver_disease,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=major_depression,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=personality_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=unemployment,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 4.144418423106954,
          "description": "min=4.144, mean=4.144, max=4.144, sum=4.144 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.5293132328308208,
          "description": "min=0.529, mean=0.529, max=0.529, sum=0.529 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.6443333333333333,
          "description": "min=0.194, mean=0.644, max=0.897, sum=1.933 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=lab_anemia,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=new_hypertension,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.854,
          "description": "min=0.854, mean=0.854, max=0.854, sum=0.854 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.6071428571428571,
          "description": "min=0.607, mean=0.607, max=0.607, sum=0.607 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 3.66666666666667,
          "description": "min=3.667, mean=3.667, max=3.667, sum=3.667 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.640661938534279,
          "description": "min=0.641, mean=0.641, max=0.641, sum=0.641 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.9245901639344263,
          "description": "min=0.925, mean=0.925, max=0.925, sum=0.925 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 3.567000000000002,
          "description": "min=3.567, mean=3.567, max=3.567, sum=3.567 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 4.124074074074071,
          "description": "min=4.124, mean=4.124, max=4.124, sum=4.124 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 3.8376736111111147,
          "description": "min=3.838, mean=3.838, max=3.838, sum=3.838 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 4.058111111111116,
          "description": "min=4.058, mean=4.058, max=4.058, sum=4.058 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 3.7677777777777766,
          "description": "min=3.768, mean=3.768, max=3.768, sum=3.768 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 4.923796486424825,
          "description": "min=4.924, mean=4.924, max=4.924, sum=4.924 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 4.490566037735847,
          "description": "min=4.491, mean=4.491, max=4.491, sum=4.491 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 3.5866420437057647,
          "description": "min=3.587, mean=3.587, max=3.587, sum=3.587 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 4.086388888888882,
          "description": "min=4.039, mean=4.086, max=4.133, sum=8.173 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "med_dialog,subset=icliniq:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.789,
          "description": "min=0.789, mean=0.789, max=0.789, sum=0.789 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 4.674074074074076,
          "description": "min=4.674, mean=4.674, max=4.674, sum=4.674 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 4.228855721393035,
          "description": "min=4.229, mean=4.229, max=4.229, sum=4.229 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.6833333333333333,
          "description": "min=0.683, mean=0.683, max=0.683, sum=0.683 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.9766666666666667,
          "description": "min=0.977, mean=0.977, max=0.977, sum=0.977 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.733,
          "description": "min=0.733, mean=0.733, max=0.733, sum=0.733 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.074,
          "description": "min=0.074, mean=0.074, max=0.074, sum=0.074 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.8363636363636363,
          "description": "min=0.836, mean=0.836, max=0.836, sum=0.836 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.5688622754491018,
          "description": "min=0.569, mean=0.569, max=0.569, sum=0.569 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.813953488372093,
          "description": "min=0.744, mean=0.814, max=0.884, sum=2.442 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=CREATININE,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.882,
          "description": "min=0.882, mean=0.882, max=0.882, sum=0.882 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.768,
          "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.19714220046154962,
          "description": "min=0.197, mean=0.197, max=0.197, sum=0.197 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.7828746177370031,
          "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.563,
          "description": "min=0.563, mean=0.563, max=0.563, sum=0.563 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.363,
          "description": "min=0.363, mean=0.363, max=0.363, sum=0.363 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        }
      ],
      [
        {
          "value": "o3-mini (2025-01-31)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6410714285714286,
          "markdown": false
        },
        {
          "value": 0.34,
          "description": "min=0.34, mean=0.34, max=0.34, sum=0.34 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.8305836653607831,
          "description": "min=0.707, mean=0.831, max=0.94, sum=10.798 (13)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.603695029924544,
          "description": "min=4.604, mean=4.604, max=4.604, sum=4.604 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.6867671691792295,
          "description": "min=0.687, mean=0.687, max=0.687, sum=0.687 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.642,
          "description": "min=0.342, mean=0.642, max=0.849, sum=1.926 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.893,
          "description": "min=0.893, mean=0.893, max=0.893, sum=0.893 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.8116883116883117,
          "description": "min=0.812, mean=0.812, max=0.812, sum=0.812 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.005219985085757,
          "description": "min=4.005, mean=4.005, max=4.005, sum=4.005 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.7990543735224587,
          "description": "min=0.799, mean=0.799, max=0.799, sum=0.799 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.9256830601092896,
          "description": "min=0.926, mean=0.926, max=0.926, sum=0.926 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.293000000000009,
          "description": "min=4.293, mean=4.293, max=4.293, sum=4.293 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.577777777777778,
          "description": "min=4.578, mean=4.578, max=4.578, sum=4.578 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.274305555555555,
          "description": "min=4.274, mean=4.274, max=4.274, sum=4.274 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.358888888888886,
          "description": "min=4.359, mean=4.359, max=4.359, sum=4.359 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 3.846666666666666,
          "description": "min=3.847, mean=3.847, max=3.847, sum=3.847 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.940908053844398,
          "description": "min=4.941, mean=4.941, max=4.941, sum=4.941 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.345266892436706,
          "description": "min=4.345, mean=4.345, max=4.345, sum=4.345 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.5807940904893885,
          "description": "min=4.581, mean=4.581, max=4.581, sum=4.581 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.227944444444448,
          "description": "min=4.194, mean=4.228, max=4.262, sum=8.456 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.713,
          "description": "min=0.713, mean=0.713, max=0.713, sum=0.713 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.56962962962963,
          "description": "min=4.57, mean=4.57, max=4.57, sum=4.57 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.933665008291875,
          "description": "min=4.934, mean=4.934, max=4.934, sum=4.934 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.6966666666666667,
          "description": "min=0.697, mean=0.697, max=0.697, sum=0.697 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.9166666666666666,
          "description": "min=0.917, mean=0.917, max=0.917, sum=0.917 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.743,
          "description": "min=0.743, mean=0.743, max=0.743, sum=0.743 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.272,
          "description": "min=0.272, mean=0.272, max=0.272, sum=0.272 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.8045454545454546,
          "description": "min=0.805, mean=0.805, max=0.805, sum=0.805 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.874251497005988,
          "description": "min=0.874, mean=0.874, max=0.874, sum=0.874 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.8953488372093023,
          "description": "min=0.849, mean=0.895, max=0.93, sum=2.686 (3)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.896,
          "description": "min=0.896, mean=0.896, max=0.896, sum=0.896 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.784,
          "description": "min=0.784, mean=0.784, max=0.784, sum=0.784 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.24289418050953807,
          "description": "min=0.243, mean=0.243, max=0.243, sum=0.243 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.8960244648318043,
          "description": "min=0.896, mean=0.896, max=0.896, sum=0.896 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.559,
          "description": "min=0.559, mean=0.559, max=0.559, sum=0.559 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.546,
          "description": "min=0.546, mean=0.546, max=0.546, sum=0.546 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        }
      ]
    ],
    "links": [
      {
        "text": "LaTeX",
        "href": "benchmark_output/releases/v2.0.0/groups/latex/medhelm_scenarios_accuracy.tex"
      },
      {
        "text": "JSON",
        "href": "benchmark_output/releases/v2.0.0/groups/json/medhelm_scenarios_accuracy.json"
      }
    ],
    "name": "accuracy"
  },
  {
    "title": "Efficiency",
    "header": [
      {
        "value": "Model",
        "markdown": false,
        "metadata": {}
      },
      {
        "value": "Mean win rate",
        "description": "How many models this model outperforms on average (over columns).",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {}
      },
      {
        "value": "MedCalc-Bench - Observed inference time (s)",
        "description": "A dataset which consists of a patient note, a question requesting to compute a specific medical value, and a ground truth answer [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MedCalc-Bench"
        }
      },
      {
        "value": "CLEAR - Observed inference time (s)",
        "description": "A dataset for evaluating the presence of a specific medical condition from patient notes with yes/no/maybe classifications [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "CLEAR"
        }
      },
      {
        "value": "MTSamples - Observed inference time (s)",
        "description": "A dataset of clinical notes where the model is prompted to generate the appropriate treatment plan for this patient [(MTSamples, 2025)](https://mtsamples.com).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MTSamples"
        }
      },
      {
        "value": "Medec - Observed inference time (s)",
        "description": "A dataset containing medical narratives with error detection and correction pairs [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "Medec"
        }
      },
      {
        "value": "EHRSHOT - Observed inference time (s)",
        "description": "A dataset given a patient record of EHR codes, classifying if an event will occur at a future date or not [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "EHRSHOT"
        }
      },
      {
        "value": "HeadQA - Observed inference time (s)",
        "description": "A collection of biomedical multiple-choice questions for testing medical knowledge [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "HeadQA"
        }
      },
      {
        "value": "Medbullets - Observed inference time (s)",
        "description": "A USMLE-style medical question dataset with multiple-choice answers and explanations [(MedBullets, 2025)](https://step2.medbullets.com).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "Medbullets"
        }
      },
      {
        "value": "MedAlign - Observed inference time (s)",
        "description": "A dataset that asks models to answer questions/follow instructions over longitudinal EHR [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MedAlign"
        }
      },
      {
        "value": "ADHD-Behavior - Observed inference time (s)",
        "description": "A dataset that classifies whether a clinical note contains a clinician recommendation for parent training in behavior management, which is the first-line evidence-based treatment for young children with ADHD [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "ADHD-Behavior"
        }
      },
      {
        "value": "ADHD-MedEffects - Observed inference time (s)",
        "description": "A dataset that classifies whether a clinical note contains documentation of side effect monitoring (recording of absence or presence of medication side effects), as recommended in clinical practice guidelines [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "ADHD-MedEffects"
        }
      },
      {
        "value": "DischargeMe - Observed inference time (s)",
        "description": "DischargeMe is a discharge instruction generation dataset and brief hospital course generation dataset collected from MIMIC-IV data, considering only the discharge text as well as the radiology report text [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "DischargeMe"
        }
      },
      {
        "value": "ACI-Bench - Observed inference time (s)",
        "description": "A dataset of patient-doctor conversations paired with structured clinical notes [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "ACI-Bench"
        }
      },
      {
        "value": "MTSamples Procedures - Observed inference time (s)",
        "description": "A dataset that provides a patient note regarding an operation, with the objective to document the procedure.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MTSamples Procedures"
        }
      },
      {
        "value": "MIMIC-RRS - Observed inference time (s)",
        "description": "A dataset containing radiology reports with findings sections from MIMIC-III paired with their corresponding impression sections, used for generating radiology report summaries [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MIMIC-RRS"
        }
      },
      {
        "value": "MIMIC-BHC - Observed inference time (s)",
        "description": "A summarization task using a curated collection of preprocessed discharge notes paired with their corresponding brief hospital course (BHC) summaries [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MIMIC-BHC"
        }
      },
      {
        "value": "NoteExtract - Observed inference time (s)",
        "description": "A dataset containing free form text of a clinical health worker care plan, with the associated goal being to restructure that text into a given format.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "NoteExtract"
        }
      },
      {
        "value": "MedicationQA - Observed inference time (s)",
        "description": "Consumer medication questions with reference answers.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MedicationQA"
        }
      },
      {
        "value": "PatientInstruct - Observed inference time (s)",
        "description": "A dataset containing case details used to generate customized post-procedure patient instructions.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "PatientInstruct"
        }
      },
      {
        "value": "MedDialog - Observed inference time (s)",
        "description": "A collection of doctor-patient conversations with corresponding summaries.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MedDialog"
        }
      },
      {
        "value": "MedConfInfo - Observed inference time (s)",
        "description": "A dataset of clinical notes from adolescent patients used to identify sensitive protected health information that should be restricted from parental access [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MedConfInfo"
        }
      },
      {
        "value": "MEDIQA - Observed inference time (s)",
        "description": "A dataset including a medical question, a set of candidate answers, relevance annotations for ranking, and additional context to evaluate understanding and retrieval capabilities in a healthcare setting.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MEDIQA"
        }
      },
      {
        "value": "MentalHealth - Observed inference time (s)",
        "description": "A dataset containing a counselor and mental health patient conversation, where the objective is to generate an empathetic counselor response.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MentalHealth"
        }
      },
      {
        "value": "ProxySender - Observed inference time (s)",
        "description": "This dataset features messages sent by proxy users and non proxy users, for evaluation of LLM capabilities to determine the sender [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "ProxySender"
        }
      },
      {
        "value": "PrivacyDetection - Observed inference time (s)",
        "description": "A dataset that determines if a message leaks any confidential information from the patient [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "PrivacyDetection"
        }
      },
      {
        "value": "PubMedQA - Observed inference time (s)",
        "description": "A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "PubMedQA"
        }
      },
      {
        "value": "EHRSQL - Observed inference time (s)",
        "description": "Given a natural language instruction, generate an SQL query that would be used in clinical research.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "EHRSQL"
        }
      },
      {
        "value": "BMT-Status - Observed inference time (s)",
        "description": "A dataset containing patient notes with associated questions and answers related to bone marrow transplantation.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "BMT-Status"
        }
      },
      {
        "value": "RaceBias - Observed inference time (s)",
        "description": "A collection of LLM outputs in response to medical questions with race-based biases, with the objective being to classify whether the output contains racially biased content.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "RaceBias"
        }
      },
      {
        "value": "N2C2-CT - Observed inference time (s)",
        "description": "A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "N2C2-CT"
        }
      },
      {
        "value": "MedHallu - Observed inference time (s)",
        "description": "A dataset of PubMed articles and associated questions, with the objective being to classify whether the answer is factual or hallucinated.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MedHallu"
        }
      },
      {
        "value": "HospiceReferral - Observed inference time (s)",
        "description": "A dataset evaluating performance in identifying appropriate patient referrals to hospice care.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "HospiceReferral"
        }
      },
      {
        "value": "MIMIC-IV Billing Code - Observed inference time (s)",
        "description": "A  dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MIMIC-IV Billing Code"
        }
      },
      {
        "value": "ClinicReferral - Observed inference time (s)",
        "description": "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "ClinicReferral"
        }
      },
      {
        "value": "CDI-QA - Observed inference time (s)",
        "description": "A dataset built from Clinical Document Integrity (CDI) notes, to assess the ability to answer verification questions from previous notes.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "CDI-QA"
        }
      },
      {
        "value": "ENT-Referral - Observed inference time (s)",
        "description": "A dataset designed to evaluate performance in identifying appropriate patient referrals to Ear, Nose, and Throat specialists.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "ENT-Referral"
        }
      }
    ],
    "rows": [
      [
        {
          "value": "Claude 3.5 Sonnet (20241022)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.43014705882352944,
          "markdown": false
        },
        {
          "value": 1.3506605696678162,
          "description": "min=1.351, mean=1.351, max=1.351, sum=1.351 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.707505668594481,
          "description": "min=1.672, mean=1.708, max=1.764, sum=22.198 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 10.976841152133092,
          "description": "min=10.977, mean=10.977, max=10.977, sum=10.977 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 3.558966633462826,
          "description": "min=3.559, mean=3.559, max=3.559, sum=3.559 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 7.37325446120898,
          "description": "min=6.852, mean=7.373, max=7.856, sum=22.12 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.3471950261592864,
          "description": "min=1.347, mean=1.347, max=1.347, sum=1.347 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.3105680098781338,
          "description": "min=1.311, mean=1.311, max=1.311, sum=1.311 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 15.022760760864156,
          "description": "min=15.023, mean=15.023, max=15.023, sum=15.023 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.6249394027899342,
          "description": "min=1.625, mean=1.625, max=1.625, sum=1.625 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.794606918585105,
          "description": "min=1.795, mean=1.795, max=1.795, sum=1.795 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 9.33179973578453,
          "description": "min=9.332, mean=9.332, max=9.332, sum=9.332 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 11.376140429576237,
          "description": "min=11.376, mean=11.376, max=11.376, sum=11.376 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 10.682079760357738,
          "description": "min=10.682, mean=10.682, max=10.682, sum=10.682 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 3.1525755443573,
          "description": "min=3.153, mean=3.153, max=3.153, sum=3.153 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 5.536085119247437,
          "description": "min=5.536, mean=5.536, max=5.536, sum=5.536 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 8.081205949646247,
          "description": "min=8.081, mean=8.081, max=8.081, sum=8.081 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 6.5651008826727795,
          "description": "min=6.565, mean=6.565, max=6.565, sum=6.565 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 7.503762823723029,
          "description": "min=7.504, mean=7.504, max=7.504, sum=7.504 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 2.7432067596912386,
          "description": "min=2.721, mean=2.743, max=2.766, sum=5.486 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 2.208379147768021,
          "description": "min=2.208, mean=2.208, max=2.208, sum=2.208 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 9.511107142766317,
          "description": "min=9.511, mean=9.511, max=9.511, sum=9.511 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 6.4161380796290155,
          "description": "min=6.416, mean=6.416, max=6.416, sum=6.416 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.543018540541331,
          "description": "min=1.543, mean=1.543, max=1.543, sum=1.543 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.6228671113650004,
          "description": "min=1.623, mean=1.623, max=1.623, sum=1.623 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.489382393360138,
          "description": "min=1.489, mean=1.489, max=1.489, sum=1.489 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.226262514407818,
          "description": "min=4.226, mean=4.226, max=4.226, sum=4.226 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 2.24667038267309,
          "description": "min=2.247, mean=2.247, max=2.247, sum=2.247 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.2453928564836878,
          "description": "min=1.245, mean=1.245, max=1.245, sum=1.245 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 3.34105330197386,
          "description": "min=3.265, mean=3.341, max=3.381, sum=10.023 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 2.388364407300949,
          "description": "min=2.388, mean=2.388, max=2.388, sum=2.388 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 5.993295238733292,
          "description": "min=5.993, mean=5.993, max=5.993, sum=5.993 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 9.196969989607458,
          "description": "min=9.197, mean=9.197, max=9.197, sum=9.197 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 13.668995621919631,
          "description": "min=13.669, mean=13.669, max=13.669, sum=13.669 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.761423567295075,
          "description": "min=4.761, mean=4.761, max=4.761, sum=4.761 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        }
      ],
      [
        {
          "value": "Claude 3.7 Sonnet (20250219)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.34558823529411764,
          "markdown": false
        },
        {
          "value": 3.863195901632309,
          "description": "min=3.863, mean=3.863, max=3.863, sum=3.863 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.5894952763282189,
          "description": "min=1.541, mean=1.589, max=1.627, sum=20.663 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 13.848624661599723,
          "description": "min=13.849, mean=13.849, max=13.849, sum=13.849 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.4292833493582566,
          "description": "min=1.429, mean=1.429, max=1.429, sum=1.429 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 7.860006173849105,
          "description": "min=7.014, mean=7.86, max=8.915, sum=23.58 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.3447579393386841,
          "description": "min=1.345, mean=1.345, max=1.345, sum=1.345 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.252411273392764,
          "description": "min=1.252, mean=1.252, max=1.252, sum=1.252 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 18.35834754873442,
          "description": "min=18.358, mean=18.358, max=18.358, sum=18.358 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.6512611312505483,
          "description": "min=1.651, mean=1.651, max=1.651, sum=1.651 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.7997335014447489,
          "description": "min=1.8, mean=1.8, max=1.8, sum=1.8 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 9.756311689853668,
          "description": "min=9.756, mean=9.756, max=9.756, sum=9.756 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 13.21376874645551,
          "description": "min=13.214, mean=13.214, max=13.214, sum=13.214 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 12.927648959681392,
          "description": "min=12.928, mean=12.928, max=12.928, sum=12.928 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 3.2667968525886537,
          "description": "min=3.267, mean=3.267, max=3.267, sum=3.267 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 6.3847999048233035,
          "description": "min=6.385, mean=6.385, max=6.385, sum=6.385 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 5.383256822640891,
          "description": "min=5.383, mean=5.383, max=5.383, sum=5.383 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 8.33733369269461,
          "description": "min=8.337, mean=8.337, max=8.337, sum=8.337 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 8.463370934087484,
          "description": "min=8.463, mean=8.463, max=8.463, sum=8.463 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 3.1681859172582625,
          "description": "min=3.022, mean=3.168, max=3.314, sum=6.336 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 2.0782204871177674,
          "description": "min=2.078, mean=2.078, max=2.078, sum=2.078 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 11.665022039413453,
          "description": "min=11.665, mean=11.665, max=11.665, sum=11.665 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 7.703489143456986,
          "description": "min=7.703, mean=7.703, max=7.703, sum=7.703 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.765497265656789,
          "description": "min=1.765, mean=1.765, max=1.765, sum=1.765 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 2.0104193170865376,
          "description": "min=2.01, mean=2.01, max=2.01, sum=2.01 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 2.4872676334381105,
          "description": "min=2.487, mean=2.487, max=2.487, sum=2.487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 8.11839797091484,
          "description": "min=8.118, mean=8.118, max=8.118, sum=8.118 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 2.2075942852280357,
          "description": "min=2.208, mean=2.208, max=2.208, sum=2.208 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 2.5685647547601937,
          "description": "min=2.569, mean=2.569, max=2.569, sum=2.569 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 3.0745366978090867,
          "description": "min=3.06, mean=3.075, max=3.085, sum=9.224 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.534155822515488,
          "description": "min=4.534, mean=4.534, max=4.534, sum=4.534 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 6.536557952165603,
          "description": "min=6.537, mean=6.537, max=6.537, sum=6.537 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 10.019044827248344,
          "description": "min=10.019, mean=10.019, max=10.019, sum=10.019 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 16.081016157627104,
          "description": "min=16.081, mean=16.081, max=16.081, sum=16.081 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.905725754976273,
          "description": "min=4.906, mean=4.906, max=4.906, sum=4.906 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        }
      ],
      [
        {
          "value": "DeepSeek R1",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.01838235294117647,
          "markdown": false
        },
        {
          "value": 43.75286227345467,
          "description": "min=43.753, mean=43.753, max=43.753, sum=43.753 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 5.571368995212822,
          "description": "min=3.961, mean=5.571, max=7.31, sum=72.428 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 29.774344934014714,
          "description": "min=29.774, mean=29.774, max=29.774, sum=29.774 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 41.87717197728117,
          "description": "min=41.877, mean=41.877, max=41.877, sum=41.877 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 32.238988775283346,
          "description": "min=29.453, mean=32.239, max=34.805, sum=96.717 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 20.78036990451813,
          "description": "min=20.78, mean=20.78, max=20.78, sum=20.78 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 34.088611539308125,
          "description": "min=34.089, mean=34.089, max=34.089, sum=34.089 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 55.19278714320804,
          "description": "min=55.193, mean=55.193, max=55.193, sum=55.193 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 3.7312907789334058,
          "description": "min=3.731, mean=3.731, max=3.731, sum=3.731 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.9966402445110765,
          "description": "min=4.997, mean=4.997, max=4.997, sum=4.997 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 14.265671773433684,
          "description": "min=14.266, mean=14.266, max=14.266, sum=14.266 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 15.778188250462215,
          "description": "min=15.778, mean=15.778, max=15.778, sum=15.778 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 21.84207220003009,
          "description": "min=21.842, mean=21.842, max=21.842, sum=21.842 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 12.422235173121615,
          "description": "min=12.422, mean=12.422, max=12.422, sum=12.422 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 8.098028691128047,
          "description": "min=8.098, mean=8.098, max=8.098, sum=8.098 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 29.56650953713873,
          "description": "min=29.567, mean=29.567, max=29.567, sum=29.567 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 21.51520863188521,
          "description": "min=21.515, mean=21.515, max=21.515, sum=21.515 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 21.370546659274115,
          "description": "min=21.371, mean=21.371, max=21.371, sum=21.371 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 8.633845594762542,
          "description": "min=7.447, mean=8.634, max=9.82, sum=17.268 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.726435486793518,
          "description": "min=4.726, mean=4.726, max=4.726, sum=4.726 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 21.951781096458436,
          "description": "min=21.952, mean=21.952, max=21.952, sum=21.952 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 16.309963222759873,
          "description": "min=16.31, mean=16.31, max=16.31, sum=16.31 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 5.393800741036733,
          "description": "min=5.394, mean=5.394, max=5.394, sum=5.394 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 3.011624131202698,
          "description": "min=3.012, mean=3.012, max=3.012, sum=3.012 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 8.807464688062668,
          "description": "min=8.807, mean=8.807, max=8.807, sum=8.807 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 19.193029651880263,
          "description": "min=19.193, mean=19.193, max=19.193, sum=19.193 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.999103357575157,
          "description": "min=4.999, mean=4.999, max=4.999, sum=4.999 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 7.5093938205056565,
          "description": "min=7.509, mean=7.509, max=7.509, sum=7.509 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 15.904167234127527,
          "description": "min=11.84, mean=15.904, max=23.456, sum=47.713 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 6.629145324707031,
          "description": "min=6.629, mean=6.629, max=6.629, sum=6.629 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 14.038371870109627,
          "description": "min=14.038, mean=14.038, max=14.038, sum=14.038 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 18.646856945224492,
          "description": "min=18.647, mean=18.647, max=18.647, sum=18.647 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 31.498115318775177,
          "description": "min=31.498, mean=31.498, max=31.498, sum=31.498 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 8.433589814662934,
          "description": "min=8.434, mean=8.434, max=8.434, sum=8.434 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        }
      ],
      [
        {
          "value": "Gemini 1.5 Pro (001)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5073529411764706,
          "markdown": false
        },
        {
          "value": 0.5846519210338592,
          "description": "min=0.585, mean=0.585, max=0.585, sum=0.585 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2.112702348308752,
          "description": "min=2.039, mean=2.113, max=2.412, sum=27.465 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 13.974278546049668,
          "description": "min=13.974, mean=13.974, max=13.974, sum=13.974 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.9463922046936137,
          "description": "min=0.946, mean=0.946, max=0.946, sum=0.946 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 4.764238543669383,
          "description": "min=4.487, mean=4.764, max=5.091, sum=14.293 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.7839107191562653,
          "description": "min=0.784, mean=0.784, max=0.784, sum=0.784 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.5115113142248872,
          "description": "min=0.512, mean=0.512, max=0.512, sum=0.512 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 10.494709195706669,
          "description": "min=10.495, mean=10.495, max=10.495, sum=10.495 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1.9810318197200767,
          "description": "min=1.981, mean=1.981, max=1.981, sum=1.981 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1.9988630597057238,
          "description": "min=1.999, mean=1.999, max=1.999, sum=1.999 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 7.394040625572204,
          "description": "min=7.394, mean=7.394, max=7.394, sum=7.394 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 11.991099919875463,
          "description": "min=11.991, mean=11.991, max=11.991, sum=11.991 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 7.07927854731679,
          "description": "min=7.079, mean=7.079, max=7.079, sum=7.079 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.4135646600723266,
          "description": "min=3.414, mean=3.414, max=3.414, sum=3.414 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.7724135589599608,
          "description": "min=3.772, mean=3.772, max=3.772, sum=3.772 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 4.6515466530465,
          "description": "min=4.652, mean=4.652, max=4.652, sum=4.652 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 6.1594320007955385,
          "description": "min=6.159, mean=6.159, max=6.159, sum=6.159 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 7.228144612668954,
          "description": "min=7.228, mean=7.228, max=7.228, sum=7.228 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2.9894730966091156,
          "description": "min=2.966, mean=2.989, max=3.013, sum=5.979 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "med_dialog,subset=icliniq:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2.17378666639328,
          "description": "min=2.174, mean=2.174, max=2.174, sum=2.174 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 8.410743509928386,
          "description": "min=8.411, mean=8.411, max=8.411, sum=8.411 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.906367454955827,
          "description": "min=3.906, mean=3.906, max=3.906, sum=3.906 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2.3673279166221617,
          "description": "min=2.367, mean=2.367, max=2.367, sum=2.367 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2.226118216514587,
          "description": "min=2.226, mean=2.226, max=2.226, sum=2.226 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.5072131760120392,
          "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1.806319487810135,
          "description": "min=1.806, mean=1.806, max=1.806, sum=1.806 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2.147076643596996,
          "description": "min=2.147, mean=2.147, max=2.147, sum=2.147 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.5023995673585081,
          "description": "min=0.502, mean=0.502, max=0.502, sum=0.502 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2.7554199520007585,
          "description": "min=2.722, mean=2.755, max=2.792, sum=8.266 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.5319631929397584,
          "description": "min=0.532, mean=0.532, max=0.532, sum=0.532 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 6.904875324964523,
          "description": "min=6.905, mean=6.905, max=6.905, sum=6.905 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 5.151954744204834,
          "description": "min=5.152, mean=5.152, max=5.152, sum=5.152 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 14.265551701307297,
          "description": "min=14.266, mean=14.266, max=14.266, sum=14.266 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.5295421197414396,
          "description": "min=3.53, mean=3.53, max=3.53, sum=3.53 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        }
      ],
      [
        {
          "value": "Gemini 2.0 Flash",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7647058823529411,
          "markdown": false
        },
        {
          "value": 0.38494687390327453,
          "description": "min=0.385, mean=0.385, max=0.385, sum=0.385 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1.916397778879627,
          "description": "min=1.878, mean=1.916, max=2.008, sum=24.913 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 5.227254615176198,
          "description": "min=5.227, mean=5.227, max=5.227, sum=5.227 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.4693439094664863,
          "description": "min=0.469, mean=0.469, max=0.469, sum=0.469 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.30513532336553,
          "description": "min=3.199, mean=3.305, max=3.479, sum=9.915 (3)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.33297129392623903,
          "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.33868198038695696,
          "description": "min=0.339, mean=0.339, max=0.339, sum=0.339 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 5.251175875631755,
          "description": "min=5.251, mean=5.251, max=5.251, sum=5.251 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1.8757229600955971,
          "description": "min=1.876, mean=1.876, max=1.876, sum=1.876 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1.877303554190964,
          "description": "min=1.877, mean=1.877, max=1.877, sum=1.877 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.683770047426224,
          "description": "min=3.684, mean=3.684, max=3.684, sum=3.684 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.9915327548980715,
          "description": "min=4.992, mean=4.992, max=4.992, sum=4.992 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 5.109029924497008,
          "description": "min=5.109, mean=5.109, max=5.109, sum=5.109 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2.4756844749450684,
          "description": "min=2.476, mean=2.476, max=2.476, sum=2.476 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2.571005198955536,
          "description": "min=2.571, mean=2.571, max=2.571, sum=2.571 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.346397649580938,
          "description": "min=3.346, mean=3.346, max=3.346, sum=3.346 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.843229794190826,
          "description": "min=3.843, mean=3.843, max=3.843, sum=3.843 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.8159013593626154,
          "description": "min=3.816, mean=3.816, max=3.816, sum=3.816 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2.359554039001465,
          "description": "min=2.326, mean=2.36, max=2.393, sum=4.719 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "med_dialog,subset=icliniq:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2.023589251279831,
          "description": "min=2.024, mean=2.024, max=2.024, sum=2.024 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 5.9853372367223105,
          "description": "min=5.985, mean=5.985, max=5.985, sum=5.985 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2.684903842299732,
          "description": "min=2.685, mean=2.685, max=2.685, sum=2.685 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2.35461799065272,
          "description": "min=2.355, mean=2.355, max=2.355, sum=2.355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2.3275622653961183,
          "description": "min=2.328, mean=2.328, max=2.328, sum=2.328 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.3441569275856018,
          "description": "min=0.344, mean=0.344, max=0.344, sum=0.344 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.7652456421852112,
          "description": "min=0.765, mean=0.765, max=0.765, sum=0.765 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1.9761667815121737,
          "description": "min=1.976, mean=1.976, max=1.976, sum=1.976 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.40159591086610347,
          "description": "min=0.402, mean=0.402, max=0.402, sum=0.402 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2.5136772468108544,
          "description": "min=2.481, mean=2.514, max=2.551, sum=7.541 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.33655450963974,
          "description": "min=0.337, mean=0.337, max=0.337, sum=0.337 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.136566119670868,
          "description": "min=3.137, mean=3.137, max=3.137, sum=3.137 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 3.85423586055044,
          "description": "min=3.854, mean=3.854, max=3.854, sum=3.854 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.905799955368042,
          "description": "min=4.906, mean=4.906, max=4.906, sum=4.906 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2.7608540432453155,
          "description": "min=2.761, mean=2.761, max=2.761, sum=2.761 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        }
      ],
      [
        {
          "value": "GPT-4o (2024-05-13)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.75,
          "markdown": false
        },
        {
          "value": 1.2598307481160513,
          "description": "min=1.26, mean=1.26, max=1.26, sum=1.26 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.4909083424839861,
          "description": "min=0.334, mean=0.491, max=1.079, sum=6.382 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.362946756587309,
          "description": "min=4.363, mean=4.363, max=4.363, sum=4.363 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.2239204533840886,
          "description": "min=1.224, mean=1.224, max=1.224, sum=1.224 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 29.547294229984285,
          "description": "min=27.229, mean=29.547, max=31.257, sum=88.642 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.36129777812957764,
          "description": "min=0.361, mean=0.361, max=0.361, sum=0.361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.3442080895622055,
          "description": "min=0.344, mean=0.344, max=0.344, sum=0.344 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 45.68971297084885,
          "description": "min=45.69, mean=45.69, max=45.69, sum=45.69 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.42209514151228233,
          "description": "min=0.422, mean=0.422, max=0.422, sum=0.422 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.4368795751873913,
          "description": "min=0.437, mean=0.437, max=0.437, sum=0.437 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 5.880889887332916,
          "description": "min=5.881, mean=5.881, max=5.881, sum=5.881 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 5.898902521530787,
          "description": "min=5.899, mean=5.899, max=5.899, sum=5.899 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 9.548104664310813,
          "description": "min=9.548, mean=9.548, max=9.548, sum=9.548 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.6024561272612934,
          "description": "min=1.602, mean=1.602, max=1.602, sum=1.602 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 2.68542049407959,
          "description": "min=2.685, mean=2.685, max=2.685, sum=2.685 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 2.600984902352523,
          "description": "min=2.601, mean=2.601, max=2.601, sum=2.601 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 2.896803928217452,
          "description": "min=2.897, mean=2.897, max=2.897, sum=2.897 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.660049803039043,
          "description": "min=4.66, mean=4.66, max=4.66, sum=4.66 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.8463265186207465,
          "description": "min=0.823, mean=0.846, max=0.87, sum=1.693 (2)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.5284792273044586,
          "description": "min=0.528, mean=0.528, max=0.528, sum=0.528 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.640198082923889,
          "description": "min=4.64, mean=4.64, max=4.64, sum=4.64 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.2639549596985775,
          "description": "min=1.264, mean=1.264, max=1.264, sum=1.264 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.6905542230606079,
          "description": "min=0.691, mean=0.691, max=0.691, sum=0.691 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.6653454645474752,
          "description": "min=0.665, mean=0.665, max=0.665, sum=0.665 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 8.718091148138047,
          "description": "min=8.718, mean=8.718, max=8.718, sum=8.718 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 3.8335974130630492,
          "description": "min=3.834, mean=3.834, max=3.834, sum=3.834 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.4940158518877896,
          "description": "min=0.494, mean=0.494, max=0.494, sum=0.494 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.32343775355173443,
          "description": "min=0.323, mean=0.323, max=0.323, sum=0.323 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 5.889826661856599,
          "description": "min=5.881, mean=5.89, max=5.903, sum=17.669 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.6999403750896454,
          "description": "min=1.7, mean=1.7, max=1.7, sum=1.7 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 2.2031820595264433,
          "description": "min=2.203, mean=2.203, max=2.203, sum=2.203 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 3.7981141481195384,
          "description": "min=3.798, mean=3.798, max=3.798, sum=3.798 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 5.85838643527031,
          "description": "min=5.858, mean=5.858, max=5.858, sum=5.858 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.5592270965576172,
          "description": "min=1.559, mean=1.559, max=1.559, sum=1.559 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        }
      ],
      [
        {
          "value": "GPT-4o mini (2024-07-18)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7830882352941176,
          "style": {
            "font-weight": "bold"
          },
          "markdown": false
        },
        {
          "value": 0.5731263158318994,
          "description": "min=0.573, mean=0.573, max=0.573, sum=0.573 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.8804333776680175,
          "description": "min=0.585, mean=0.88, max=1.126, sum=11.446 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.75874111960916,
          "description": "min=4.759, mean=4.759, max=4.759, sum=4.759 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.8237213220726065,
          "description": "min=0.824, mean=0.824, max=0.824, sum=0.824 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 29.851671765327453,
          "description": "min=27.479, mean=29.852, max=31.687, sum=89.555 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.5146426157951355,
          "description": "min=0.515, mean=0.515, max=0.515, sum=0.515 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.5177228698482761,
          "description": "min=0.518, mean=0.518, max=0.518, sum=0.518 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 46.25818750042243,
          "description": "min=46.258, mean=46.258, max=46.258, sum=46.258 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.5856102826183851,
          "description": "min=0.586, mean=0.586, max=0.586, sum=0.586 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.576928517857536,
          "description": "min=0.577, mean=0.577, max=0.577, sum=0.577 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 5.917445467233658,
          "description": "min=5.917, mean=5.917, max=5.917, sum=5.917 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.0863512297471365,
          "description": "min=4.086, mean=4.086, max=4.086, sum=4.086 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 5.294802710413933,
          "description": "min=5.295, mean=5.295, max=5.295, sum=5.295 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.257995302164102,
          "description": "min=1.258, mean=1.258, max=1.258, sum=1.258 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.5203437304496765,
          "description": "min=1.52, mean=1.52, max=1.52, sum=1.52 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 2.092606608627758,
          "description": "min=2.093, mean=2.093, max=2.093, sum=2.093 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 2.111854866384941,
          "description": "min=2.112, mean=2.112, max=2.112, sum=2.112 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.490330929241022,
          "description": "min=4.49, mean=4.49, max=4.49, sum=4.49 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.8974741188484632,
          "description": "min=0.896, mean=0.897, max=0.899, sum=1.795 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.5781944992542267,
          "description": "min=0.578, mean=0.578, max=0.578, sum=0.578 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 3.2412102206548057,
          "description": "min=3.241, mean=3.241, max=3.241, sum=3.241 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.3065443999731718,
          "description": "min=1.307, mean=1.307, max=1.307, sum=1.307 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.66272509654363,
          "description": "min=0.663, mean=0.663, max=0.663, sum=0.663 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.7041426730155945,
          "description": "min=0.704, mean=0.704, max=0.704, sum=0.704 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.6166051857471466,
          "description": "min=0.617, mean=0.617, max=0.617, sum=0.617 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0685200524330138,
          "description": "min=1.069, mean=1.069, max=1.069, sum=1.069 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.6242298223755577,
          "description": "min=0.624, mean=0.624, max=0.624, sum=0.624 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.4917304216030829,
          "description": "min=0.492, mean=0.492, max=0.492, sum=0.492 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 8.133637910665469,
          "description": "min=7.261, mean=8.134, max=8.6, sum=24.401 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.6268642969131469,
          "description": "min=0.627, mean=0.627, max=0.627, sum=0.627 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 2.053197555780411,
          "description": "min=2.053, mean=2.053, max=2.053, sum=2.053 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 3.2714920969913495,
          "description": "min=3.271, mean=3.271, max=3.271, sum=3.271 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 5.771784316778183,
          "description": "min=5.772, mean=5.772, max=5.772, sum=5.772 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.463045666217804,
          "description": "min=1.463, mean=1.463, max=1.463, sum=1.463 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.3 Instruct (70B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6286764705882353,
          "markdown": false
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.2965207255424805,
          "description": "min=0.258, mean=0.297, max=0.34, sum=3.855 (13)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=chronic_pain,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=homelessness,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=liver_disease,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=major_depression,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=personality_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=unemployment,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 9.148186227013083,
          "description": "min=9.148, mean=9.148, max=9.148, sum=9.148 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medec:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 10.951756122191748,
          "description": "min=9.876, mean=10.952, max=11.654, sum=32.855 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=lab_anemia,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=new_hypertension,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 23.10033937908659,
          "description": "min=23.1, mean=23.1, max=23.1, sum=23.1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 3.6418433290846806,
          "description": "min=3.642, mean=3.642, max=3.642, sum=3.642 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 2.073261349579024,
          "description": "min=2.073, mean=2.073, max=2.073, sum=2.073 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 7.064391430616379,
          "description": "min=7.064, mean=7.064, max=7.064, sum=7.064 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 11.603488028049469,
          "description": "min=11.603, mean=11.603, max=11.603, sum=11.603 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 9.823013886809349,
          "description": "min=9.823, mean=9.823, max=9.823, sum=9.823 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 5.954605188125219,
          "description": "min=5.955, mean=5.955, max=5.955, sum=5.955 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 2.7478469467163085,
          "description": "min=2.748, mean=2.748, max=2.748, sum=2.748 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 3.634354764675947,
          "description": "min=3.634, mean=3.634, max=3.634, sum=3.634 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 5.604794421631989,
          "description": "min=5.605, mean=5.605, max=5.605, sum=5.605 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 6.200638982397698,
          "description": "min=6.201, mean=6.201, max=6.201, sum=6.201 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.3253600368313856,
          "description": "min=1.291, mean=1.325, max=1.359, sum=2.651 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "med_dialog,subset=icliniq:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.248099797487259,
          "description": "min=1.248, mean=1.248, max=1.248, sum=1.248 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 8.413016567230224,
          "description": "min=8.413, mean=8.413, max=8.413, sum=8.413 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 2.180056931367561,
          "description": "min=2.18, mean=2.18, max=2.18, sum=2.18 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.2374761176109314,
          "description": "min=0.237, mean=0.237, max=0.237, sum=0.237 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.2110820166269938,
          "description": "min=0.211, mean=0.211, max=0.211, sum=0.211 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.2009574868462303,
          "description": "min=1.201, mean=1.201, max=1.201, sum=1.201 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.2071963594865427,
          "description": "min=1.117, mean=1.207, max=1.374, sum=3.622 (3)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=CREATININE,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 14.256210972547532,
          "description": "min=14.256, mean=14.256, max=14.256, sum=14.256 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 24.56117909253555,
          "description": "min=24.561, mean=24.561, max=24.561, sum=24.561 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 37.005855027914045,
          "description": "min=37.006, mean=37.006, max=37.006, sum=37.006 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 9.610506303310395,
          "description": "min=9.611, mean=9.611, max=9.611, sum=9.611 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        }
      ],
      [
        {
          "value": "o3-mini (2025-01-31)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.27205882352941174,
          "markdown": false
        },
        {
          "value": 12.23940966938351,
          "description": "min=12.239, mean=12.239, max=12.239, sum=12.239 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 2.478334377223607,
          "description": "min=2.001, mean=2.478, max=2.853, sum=32.218 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 9.537268293044146,
          "description": "min=9.537, mean=9.537, max=9.537, sum=9.537 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 17.57311107540453,
          "description": "min=17.573, mean=17.573, max=17.573, sum=17.573 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 22.073866986036297,
          "description": "min=9.261, mean=22.074, max=40.147, sum=66.222 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 3.9501672561168673,
          "description": "min=3.95, mean=3.95, max=3.95, sum=3.95 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 7.292820972281617,
          "description": "min=7.293, mean=7.293, max=7.293, sum=7.293 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 48.5786174415742,
          "description": "min=48.579, mean=48.579, max=48.579, sum=48.579 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 2.750029217266867,
          "description": "min=2.75, mean=2.75, max=2.75, sum=2.75 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 2.5139477516132627,
          "description": "min=2.514, mean=2.514, max=2.514, sum=2.514 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 10.002931237220764,
          "description": "min=10.003, mean=10.003, max=10.003, sum=10.003 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 6.996356503168742,
          "description": "min=6.996, mean=6.996, max=6.996, sum=6.996 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 9.186064189299941,
          "description": "min=9.186, mean=9.186, max=9.186, sum=9.186 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.547026488601521,
          "description": "min=4.547, mean=4.547, max=4.547, sum=4.547 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.090529644128048,
          "description": "min=4.091, mean=4.091, max=4.091, sum=4.091 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 10.660015441064228,
          "description": "min=10.66, mean=10.66, max=10.66, sum=10.66 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 7.778176902859581,
          "description": "min=7.778, mean=7.778, max=7.778, sum=7.778 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 12.239962604237395,
          "description": "min=12.24, mean=12.24, max=12.24, sum=12.24 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 3.500650955711766,
          "description": "min=3.271, mean=3.501, max=3.731, sum=7.001 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 2.8022545745372773,
          "description": "min=2.802, mean=2.802, max=2.802, sum=2.802 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 7.404716361363729,
          "description": "min=7.405, mean=7.405, max=7.405, sum=7.405 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 5.80801716135509,
          "description": "min=5.808, mean=5.808, max=5.808, sum=5.808 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 5.142877056598663,
          "description": "min=5.143, mean=5.143, max=5.143, sum=5.143 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 3.781061518192291,
          "description": "min=3.781, mean=3.781, max=3.781, sum=3.781 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 2.3193285653591156,
          "description": "min=2.319, mean=2.319, max=2.319, sum=2.319 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 9.879440756797791,
          "description": "min=9.879, mean=9.879, max=9.879, sum=9.879 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 3.363296365737915,
          "description": "min=3.363, mean=3.363, max=3.363, sum=3.363 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 6.315420240699174,
          "description": "min=6.315, mean=6.315, max=6.315, sum=6.315 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 8.013198332269063,
          "description": "min=6.503, mean=8.013, max=10.304, sum=24.04 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 5.188710170269013,
          "description": "min=5.189, mean=5.189, max=5.189, sum=5.189 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.608473005533218,
          "description": "min=4.608, mean=4.608, max=4.608, sum=4.608 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 5.290066151808526,
          "description": "min=5.29, mean=5.29, max=5.29, sum=5.29 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 12.857608857393265,
          "description": "min=12.858, mean=12.858, max=12.858, sum=12.858 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.357393162965774,
          "description": "min=4.357, mean=4.357, max=4.357, sum=4.357 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        }
      ]
    ],
    "links": [
      {
        "text": "LaTeX",
        "href": "benchmark_output/releases/v2.0.0/groups/latex/medhelm_scenarios_efficiency.tex"
      },
      {
        "text": "JSON",
        "href": "benchmark_output/releases/v2.0.0/groups/json/medhelm_scenarios_efficiency.json"
      }
    ],
    "name": "efficiency"
  },
  {
    "title": "General information",
    "header": [
      {
        "value": "Model",
        "markdown": false,
        "metadata": {}
      },
      {
        "value": "MedCalc-Bench - # eval",
        "description": "A dataset which consists of a patient note, a question requesting to compute a specific medical value, and a ground truth answer [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MedCalc-Bench"
        }
      },
      {
        "value": "MedCalc-Bench - # train",
        "description": "A dataset which consists of a patient note, a question requesting to compute a specific medical value, and a ground truth answer [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MedCalc-Bench"
        }
      },
      {
        "value": "MedCalc-Bench - truncated",
        "description": "A dataset which consists of a patient note, a question requesting to compute a specific medical value, and a ground truth answer [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MedCalc-Bench"
        }
      },
      {
        "value": "MedCalc-Bench - # prompt tokens",
        "description": "A dataset which consists of a patient note, a question requesting to compute a specific medical value, and a ground truth answer [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MedCalc-Bench"
        }
      },
      {
        "value": "MedCalc-Bench - # output tokens",
        "description": "A dataset which consists of a patient note, a question requesting to compute a specific medical value, and a ground truth answer [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MedCalc-Bench"
        }
      },
      {
        "value": "CLEAR - # eval",
        "description": "A dataset for evaluating the presence of a specific medical condition from patient notes with yes/no/maybe classifications [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "CLEAR"
        }
      },
      {
        "value": "CLEAR - # train",
        "description": "A dataset for evaluating the presence of a specific medical condition from patient notes with yes/no/maybe classifications [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "CLEAR"
        }
      },
      {
        "value": "CLEAR - truncated",
        "description": "A dataset for evaluating the presence of a specific medical condition from patient notes with yes/no/maybe classifications [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "CLEAR"
        }
      },
      {
        "value": "CLEAR - # prompt tokens",
        "description": "A dataset for evaluating the presence of a specific medical condition from patient notes with yes/no/maybe classifications [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "CLEAR"
        }
      },
      {
        "value": "CLEAR - # output tokens",
        "description": "A dataset for evaluating the presence of a specific medical condition from patient notes with yes/no/maybe classifications [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "CLEAR"
        }
      },
      {
        "value": "MTSamples - # eval",
        "description": "A dataset of clinical notes where the model is prompted to generate the appropriate treatment plan for this patient [(MTSamples, 2025)](https://mtsamples.com).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MTSamples"
        }
      },
      {
        "value": "MTSamples - # train",
        "description": "A dataset of clinical notes where the model is prompted to generate the appropriate treatment plan for this patient [(MTSamples, 2025)](https://mtsamples.com).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MTSamples"
        }
      },
      {
        "value": "MTSamples - truncated",
        "description": "A dataset of clinical notes where the model is prompted to generate the appropriate treatment plan for this patient [(MTSamples, 2025)](https://mtsamples.com).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MTSamples"
        }
      },
      {
        "value": "MTSamples - # prompt tokens",
        "description": "A dataset of clinical notes where the model is prompted to generate the appropriate treatment plan for this patient [(MTSamples, 2025)](https://mtsamples.com).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MTSamples"
        }
      },
      {
        "value": "MTSamples - # output tokens",
        "description": "A dataset of clinical notes where the model is prompted to generate the appropriate treatment plan for this patient [(MTSamples, 2025)](https://mtsamples.com).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MTSamples"
        }
      },
      {
        "value": "Medec - # eval",
        "description": "A dataset containing medical narratives with error detection and correction pairs [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "Medec"
        }
      },
      {
        "value": "Medec - # train",
        "description": "A dataset containing medical narratives with error detection and correction pairs [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "Medec"
        }
      },
      {
        "value": "Medec - truncated",
        "description": "A dataset containing medical narratives with error detection and correction pairs [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "Medec"
        }
      },
      {
        "value": "Medec - # prompt tokens",
        "description": "A dataset containing medical narratives with error detection and correction pairs [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "Medec"
        }
      },
      {
        "value": "Medec - # output tokens",
        "description": "A dataset containing medical narratives with error detection and correction pairs [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "Medec"
        }
      },
      {
        "value": "EHRSHOT - # eval",
        "description": "A dataset given a patient record of EHR codes, classifying if an event will occur at a future date or not [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "EHRSHOT"
        }
      },
      {
        "value": "EHRSHOT - # train",
        "description": "A dataset given a patient record of EHR codes, classifying if an event will occur at a future date or not [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "EHRSHOT"
        }
      },
      {
        "value": "EHRSHOT - truncated",
        "description": "A dataset given a patient record of EHR codes, classifying if an event will occur at a future date or not [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "EHRSHOT"
        }
      },
      {
        "value": "EHRSHOT - # prompt tokens",
        "description": "A dataset given a patient record of EHR codes, classifying if an event will occur at a future date or not [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "EHRSHOT"
        }
      },
      {
        "value": "EHRSHOT - # output tokens",
        "description": "A dataset given a patient record of EHR codes, classifying if an event will occur at a future date or not [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "EHRSHOT"
        }
      },
      {
        "value": "HeadQA - # eval",
        "description": "A collection of biomedical multiple-choice questions for testing medical knowledge [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "HeadQA"
        }
      },
      {
        "value": "HeadQA - # train",
        "description": "A collection of biomedical multiple-choice questions for testing medical knowledge [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "HeadQA"
        }
      },
      {
        "value": "HeadQA - truncated",
        "description": "A collection of biomedical multiple-choice questions for testing medical knowledge [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "HeadQA"
        }
      },
      {
        "value": "HeadQA - # prompt tokens",
        "description": "A collection of biomedical multiple-choice questions for testing medical knowledge [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "HeadQA"
        }
      },
      {
        "value": "HeadQA - # output tokens",
        "description": "A collection of biomedical multiple-choice questions for testing medical knowledge [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "HeadQA"
        }
      },
      {
        "value": "Medbullets - # eval",
        "description": "A USMLE-style medical question dataset with multiple-choice answers and explanations [(MedBullets, 2025)](https://step2.medbullets.com).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "Medbullets"
        }
      },
      {
        "value": "Medbullets - # train",
        "description": "A USMLE-style medical question dataset with multiple-choice answers and explanations [(MedBullets, 2025)](https://step2.medbullets.com).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "Medbullets"
        }
      },
      {
        "value": "Medbullets - truncated",
        "description": "A USMLE-style medical question dataset with multiple-choice answers and explanations [(MedBullets, 2025)](https://step2.medbullets.com).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "Medbullets"
        }
      },
      {
        "value": "Medbullets - # prompt tokens",
        "description": "A USMLE-style medical question dataset with multiple-choice answers and explanations [(MedBullets, 2025)](https://step2.medbullets.com).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "Medbullets"
        }
      },
      {
        "value": "Medbullets - # output tokens",
        "description": "A USMLE-style medical question dataset with multiple-choice answers and explanations [(MedBullets, 2025)](https://step2.medbullets.com).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "Medbullets"
        }
      },
      {
        "value": "MedAlign - # eval",
        "description": "A dataset that asks models to answer questions/follow instructions over longitudinal EHR [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MedAlign"
        }
      },
      {
        "value": "MedAlign - # train",
        "description": "A dataset that asks models to answer questions/follow instructions over longitudinal EHR [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MedAlign"
        }
      },
      {
        "value": "MedAlign - truncated",
        "description": "A dataset that asks models to answer questions/follow instructions over longitudinal EHR [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MedAlign"
        }
      },
      {
        "value": "MedAlign - # prompt tokens",
        "description": "A dataset that asks models to answer questions/follow instructions over longitudinal EHR [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MedAlign"
        }
      },
      {
        "value": "MedAlign - # output tokens",
        "description": "A dataset that asks models to answer questions/follow instructions over longitudinal EHR [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MedAlign"
        }
      },
      {
        "value": "ADHD-Behavior - # eval",
        "description": "A dataset that classifies whether a clinical note contains a clinician recommendation for parent training in behavior management, which is the first-line evidence-based treatment for young children with ADHD [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "ADHD-Behavior"
        }
      },
      {
        "value": "ADHD-Behavior - # train",
        "description": "A dataset that classifies whether a clinical note contains a clinician recommendation for parent training in behavior management, which is the first-line evidence-based treatment for young children with ADHD [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "ADHD-Behavior"
        }
      },
      {
        "value": "ADHD-Behavior - truncated",
        "description": "A dataset that classifies whether a clinical note contains a clinician recommendation for parent training in behavior management, which is the first-line evidence-based treatment for young children with ADHD [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "ADHD-Behavior"
        }
      },
      {
        "value": "ADHD-Behavior - # prompt tokens",
        "description": "A dataset that classifies whether a clinical note contains a clinician recommendation for parent training in behavior management, which is the first-line evidence-based treatment for young children with ADHD [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "ADHD-Behavior"
        }
      },
      {
        "value": "ADHD-Behavior - # output tokens",
        "description": "A dataset that classifies whether a clinical note contains a clinician recommendation for parent training in behavior management, which is the first-line evidence-based treatment for young children with ADHD [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "ADHD-Behavior"
        }
      },
      {
        "value": "ADHD-MedEffects - # eval",
        "description": "A dataset that classifies whether a clinical note contains documentation of side effect monitoring (recording of absence or presence of medication side effects), as recommended in clinical practice guidelines [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "ADHD-MedEffects"
        }
      },
      {
        "value": "ADHD-MedEffects - # train",
        "description": "A dataset that classifies whether a clinical note contains documentation of side effect monitoring (recording of absence or presence of medication side effects), as recommended in clinical practice guidelines [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "ADHD-MedEffects"
        }
      },
      {
        "value": "ADHD-MedEffects - truncated",
        "description": "A dataset that classifies whether a clinical note contains documentation of side effect monitoring (recording of absence or presence of medication side effects), as recommended in clinical practice guidelines [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "ADHD-MedEffects"
        }
      },
      {
        "value": "ADHD-MedEffects - # prompt tokens",
        "description": "A dataset that classifies whether a clinical note contains documentation of side effect monitoring (recording of absence or presence of medication side effects), as recommended in clinical practice guidelines [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "ADHD-MedEffects"
        }
      },
      {
        "value": "ADHD-MedEffects - # output tokens",
        "description": "A dataset that classifies whether a clinical note contains documentation of side effect monitoring (recording of absence or presence of medication side effects), as recommended in clinical practice guidelines [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "ADHD-MedEffects"
        }
      },
      {
        "value": "DischargeMe - # eval",
        "description": "DischargeMe is a discharge instruction generation dataset and brief hospital course generation dataset collected from MIMIC-IV data, considering only the discharge text as well as the radiology report text [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "DischargeMe"
        }
      },
      {
        "value": "DischargeMe - # train",
        "description": "DischargeMe is a discharge instruction generation dataset and brief hospital course generation dataset collected from MIMIC-IV data, considering only the discharge text as well as the radiology report text [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "DischargeMe"
        }
      },
      {
        "value": "DischargeMe - truncated",
        "description": "DischargeMe is a discharge instruction generation dataset and brief hospital course generation dataset collected from MIMIC-IV data, considering only the discharge text as well as the radiology report text [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "DischargeMe"
        }
      },
      {
        "value": "DischargeMe - # prompt tokens",
        "description": "DischargeMe is a discharge instruction generation dataset and brief hospital course generation dataset collected from MIMIC-IV data, considering only the discharge text as well as the radiology report text [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "DischargeMe"
        }
      },
      {
        "value": "DischargeMe - # output tokens",
        "description": "DischargeMe is a discharge instruction generation dataset and brief hospital course generation dataset collected from MIMIC-IV data, considering only the discharge text as well as the radiology report text [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "DischargeMe"
        }
      },
      {
        "value": "ACI-Bench - # eval",
        "description": "A dataset of patient-doctor conversations paired with structured clinical notes [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "ACI-Bench"
        }
      },
      {
        "value": "ACI-Bench - # train",
        "description": "A dataset of patient-doctor conversations paired with structured clinical notes [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "ACI-Bench"
        }
      },
      {
        "value": "ACI-Bench - truncated",
        "description": "A dataset of patient-doctor conversations paired with structured clinical notes [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "ACI-Bench"
        }
      },
      {
        "value": "ACI-Bench - # prompt tokens",
        "description": "A dataset of patient-doctor conversations paired with structured clinical notes [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "ACI-Bench"
        }
      },
      {
        "value": "ACI-Bench - # output tokens",
        "description": "A dataset of patient-doctor conversations paired with structured clinical notes [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "ACI-Bench"
        }
      },
      {
        "value": "MTSamples Procedures - # eval",
        "description": "A dataset that provides a patient note regarding an operation, with the objective to document the procedure.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MTSamples Procedures"
        }
      },
      {
        "value": "MTSamples Procedures - # train",
        "description": "A dataset that provides a patient note regarding an operation, with the objective to document the procedure.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MTSamples Procedures"
        }
      },
      {
        "value": "MTSamples Procedures - truncated",
        "description": "A dataset that provides a patient note regarding an operation, with the objective to document the procedure.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MTSamples Procedures"
        }
      },
      {
        "value": "MTSamples Procedures - # prompt tokens",
        "description": "A dataset that provides a patient note regarding an operation, with the objective to document the procedure.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MTSamples Procedures"
        }
      },
      {
        "value": "MTSamples Procedures - # output tokens",
        "description": "A dataset that provides a patient note regarding an operation, with the objective to document the procedure.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MTSamples Procedures"
        }
      },
      {
        "value": "MIMIC-RRS - # eval",
        "description": "A dataset containing radiology reports with findings sections from MIMIC-III paired with their corresponding impression sections, used for generating radiology report summaries [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MIMIC-RRS"
        }
      },
      {
        "value": "MIMIC-RRS - # train",
        "description": "A dataset containing radiology reports with findings sections from MIMIC-III paired with their corresponding impression sections, used for generating radiology report summaries [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MIMIC-RRS"
        }
      },
      {
        "value": "MIMIC-RRS - truncated",
        "description": "A dataset containing radiology reports with findings sections from MIMIC-III paired with their corresponding impression sections, used for generating radiology report summaries [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MIMIC-RRS"
        }
      },
      {
        "value": "MIMIC-RRS - # prompt tokens",
        "description": "A dataset containing radiology reports with findings sections from MIMIC-III paired with their corresponding impression sections, used for generating radiology report summaries [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MIMIC-RRS"
        }
      },
      {
        "value": "MIMIC-RRS - # output tokens",
        "description": "A dataset containing radiology reports with findings sections from MIMIC-III paired with their corresponding impression sections, used for generating radiology report summaries [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MIMIC-RRS"
        }
      },
      {
        "value": "MIMIC-BHC - # eval",
        "description": "A summarization task using a curated collection of preprocessed discharge notes paired with their corresponding brief hospital course (BHC) summaries [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MIMIC-BHC"
        }
      },
      {
        "value": "MIMIC-BHC - # train",
        "description": "A summarization task using a curated collection of preprocessed discharge notes paired with their corresponding brief hospital course (BHC) summaries [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MIMIC-BHC"
        }
      },
      {
        "value": "MIMIC-BHC - truncated",
        "description": "A summarization task using a curated collection of preprocessed discharge notes paired with their corresponding brief hospital course (BHC) summaries [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MIMIC-BHC"
        }
      },
      {
        "value": "MIMIC-BHC - # prompt tokens",
        "description": "A summarization task using a curated collection of preprocessed discharge notes paired with their corresponding brief hospital course (BHC) summaries [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MIMIC-BHC"
        }
      },
      {
        "value": "MIMIC-BHC - # output tokens",
        "description": "A summarization task using a curated collection of preprocessed discharge notes paired with their corresponding brief hospital course (BHC) summaries [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MIMIC-BHC"
        }
      },
      {
        "value": "NoteExtract - # eval",
        "description": "A dataset containing free form text of a clinical health worker care plan, with the associated goal being to restructure that text into a given format.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "NoteExtract"
        }
      },
      {
        "value": "NoteExtract - # train",
        "description": "A dataset containing free form text of a clinical health worker care plan, with the associated goal being to restructure that text into a given format.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "NoteExtract"
        }
      },
      {
        "value": "NoteExtract - truncated",
        "description": "A dataset containing free form text of a clinical health worker care plan, with the associated goal being to restructure that text into a given format.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "NoteExtract"
        }
      },
      {
        "value": "NoteExtract - # prompt tokens",
        "description": "A dataset containing free form text of a clinical health worker care plan, with the associated goal being to restructure that text into a given format.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "NoteExtract"
        }
      },
      {
        "value": "NoteExtract - # output tokens",
        "description": "A dataset containing free form text of a clinical health worker care plan, with the associated goal being to restructure that text into a given format.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "NoteExtract"
        }
      },
      {
        "value": "MedicationQA - # eval",
        "description": "Consumer medication questions with reference answers.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MedicationQA"
        }
      },
      {
        "value": "MedicationQA - # train",
        "description": "Consumer medication questions with reference answers.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MedicationQA"
        }
      },
      {
        "value": "MedicationQA - truncated",
        "description": "Consumer medication questions with reference answers.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MedicationQA"
        }
      },
      {
        "value": "MedicationQA - # prompt tokens",
        "description": "Consumer medication questions with reference answers.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MedicationQA"
        }
      },
      {
        "value": "MedicationQA - # output tokens",
        "description": "Consumer medication questions with reference answers.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MedicationQA"
        }
      },
      {
        "value": "PatientInstruct - # eval",
        "description": "A dataset containing case details used to generate customized post-procedure patient instructions.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "PatientInstruct"
        }
      },
      {
        "value": "PatientInstruct - # train",
        "description": "A dataset containing case details used to generate customized post-procedure patient instructions.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "PatientInstruct"
        }
      },
      {
        "value": "PatientInstruct - truncated",
        "description": "A dataset containing case details used to generate customized post-procedure patient instructions.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "PatientInstruct"
        }
      },
      {
        "value": "PatientInstruct - # prompt tokens",
        "description": "A dataset containing case details used to generate customized post-procedure patient instructions.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "PatientInstruct"
        }
      },
      {
        "value": "PatientInstruct - # output tokens",
        "description": "A dataset containing case details used to generate customized post-procedure patient instructions.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "PatientInstruct"
        }
      },
      {
        "value": "MedDialog - # eval",
        "description": "A collection of doctor-patient conversations with corresponding summaries.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MedDialog"
        }
      },
      {
        "value": "MedDialog - # train",
        "description": "A collection of doctor-patient conversations with corresponding summaries.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MedDialog"
        }
      },
      {
        "value": "MedDialog - truncated",
        "description": "A collection of doctor-patient conversations with corresponding summaries.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MedDialog"
        }
      },
      {
        "value": "MedDialog - # prompt tokens",
        "description": "A collection of doctor-patient conversations with corresponding summaries.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MedDialog"
        }
      },
      {
        "value": "MedDialog - # output tokens",
        "description": "A collection of doctor-patient conversations with corresponding summaries.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MedDialog"
        }
      },
      {
        "value": "MedConfInfo - # eval",
        "description": "A dataset of clinical notes from adolescent patients used to identify sensitive protected health information that should be restricted from parental access [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MedConfInfo"
        }
      },
      {
        "value": "MedConfInfo - # train",
        "description": "A dataset of clinical notes from adolescent patients used to identify sensitive protected health information that should be restricted from parental access [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MedConfInfo"
        }
      },
      {
        "value": "MedConfInfo - truncated",
        "description": "A dataset of clinical notes from adolescent patients used to identify sensitive protected health information that should be restricted from parental access [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MedConfInfo"
        }
      },
      {
        "value": "MedConfInfo - # prompt tokens",
        "description": "A dataset of clinical notes from adolescent patients used to identify sensitive protected health information that should be restricted from parental access [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MedConfInfo"
        }
      },
      {
        "value": "MedConfInfo - # output tokens",
        "description": "A dataset of clinical notes from adolescent patients used to identify sensitive protected health information that should be restricted from parental access [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MedConfInfo"
        }
      },
      {
        "value": "MEDIQA - # eval",
        "description": "A dataset including a medical question, a set of candidate answers, relevance annotations for ranking, and additional context to evaluate understanding and retrieval capabilities in a healthcare setting.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MEDIQA"
        }
      },
      {
        "value": "MEDIQA - # train",
        "description": "A dataset including a medical question, a set of candidate answers, relevance annotations for ranking, and additional context to evaluate understanding and retrieval capabilities in a healthcare setting.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MEDIQA"
        }
      },
      {
        "value": "MEDIQA - truncated",
        "description": "A dataset including a medical question, a set of candidate answers, relevance annotations for ranking, and additional context to evaluate understanding and retrieval capabilities in a healthcare setting.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MEDIQA"
        }
      },
      {
        "value": "MEDIQA - # prompt tokens",
        "description": "A dataset including a medical question, a set of candidate answers, relevance annotations for ranking, and additional context to evaluate understanding and retrieval capabilities in a healthcare setting.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MEDIQA"
        }
      },
      {
        "value": "MEDIQA - # output tokens",
        "description": "A dataset including a medical question, a set of candidate answers, relevance annotations for ranking, and additional context to evaluate understanding and retrieval capabilities in a healthcare setting.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MEDIQA"
        }
      },
      {
        "value": "MentalHealth - # eval",
        "description": "A dataset containing a counselor and mental health patient conversation, where the objective is to generate an empathetic counselor response.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MentalHealth"
        }
      },
      {
        "value": "MentalHealth - # train",
        "description": "A dataset containing a counselor and mental health patient conversation, where the objective is to generate an empathetic counselor response.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MentalHealth"
        }
      },
      {
        "value": "MentalHealth - truncated",
        "description": "A dataset containing a counselor and mental health patient conversation, where the objective is to generate an empathetic counselor response.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MentalHealth"
        }
      },
      {
        "value": "MentalHealth - # prompt tokens",
        "description": "A dataset containing a counselor and mental health patient conversation, where the objective is to generate an empathetic counselor response.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MentalHealth"
        }
      },
      {
        "value": "MentalHealth - # output tokens",
        "description": "A dataset containing a counselor and mental health patient conversation, where the objective is to generate an empathetic counselor response.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MentalHealth"
        }
      },
      {
        "value": "ProxySender - # eval",
        "description": "This dataset features messages sent by proxy users and non proxy users, for evaluation of LLM capabilities to determine the sender [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "ProxySender"
        }
      },
      {
        "value": "ProxySender - # train",
        "description": "This dataset features messages sent by proxy users and non proxy users, for evaluation of LLM capabilities to determine the sender [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "ProxySender"
        }
      },
      {
        "value": "ProxySender - truncated",
        "description": "This dataset features messages sent by proxy users and non proxy users, for evaluation of LLM capabilities to determine the sender [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "ProxySender"
        }
      },
      {
        "value": "ProxySender - # prompt tokens",
        "description": "This dataset features messages sent by proxy users and non proxy users, for evaluation of LLM capabilities to determine the sender [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "ProxySender"
        }
      },
      {
        "value": "ProxySender - # output tokens",
        "description": "This dataset features messages sent by proxy users and non proxy users, for evaluation of LLM capabilities to determine the sender [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "ProxySender"
        }
      },
      {
        "value": "PrivacyDetection - # eval",
        "description": "A dataset that determines if a message leaks any confidential information from the patient [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "PrivacyDetection"
        }
      },
      {
        "value": "PrivacyDetection - # train",
        "description": "A dataset that determines if a message leaks any confidential information from the patient [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "PrivacyDetection"
        }
      },
      {
        "value": "PrivacyDetection - truncated",
        "description": "A dataset that determines if a message leaks any confidential information from the patient [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "PrivacyDetection"
        }
      },
      {
        "value": "PrivacyDetection - # prompt tokens",
        "description": "A dataset that determines if a message leaks any confidential information from the patient [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "PrivacyDetection"
        }
      },
      {
        "value": "PrivacyDetection - # output tokens",
        "description": "A dataset that determines if a message leaks any confidential information from the patient [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "PrivacyDetection"
        }
      },
      {
        "value": "PubMedQA - # eval",
        "description": "A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "PubMedQA"
        }
      },
      {
        "value": "PubMedQA - # train",
        "description": "A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "PubMedQA"
        }
      },
      {
        "value": "PubMedQA - truncated",
        "description": "A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "PubMedQA"
        }
      },
      {
        "value": "PubMedQA - # prompt tokens",
        "description": "A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "PubMedQA"
        }
      },
      {
        "value": "PubMedQA - # output tokens",
        "description": "A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "PubMedQA"
        }
      },
      {
        "value": "EHRSQL - # eval",
        "description": "Given a natural language instruction, generate an SQL query that would be used in clinical research.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "EHRSQL"
        }
      },
      {
        "value": "EHRSQL - # train",
        "description": "Given a natural language instruction, generate an SQL query that would be used in clinical research.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "EHRSQL"
        }
      },
      {
        "value": "EHRSQL - truncated",
        "description": "Given a natural language instruction, generate an SQL query that would be used in clinical research.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "EHRSQL"
        }
      },
      {
        "value": "EHRSQL - # prompt tokens",
        "description": "Given a natural language instruction, generate an SQL query that would be used in clinical research.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "EHRSQL"
        }
      },
      {
        "value": "EHRSQL - # output tokens",
        "description": "Given a natural language instruction, generate an SQL query that would be used in clinical research.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "EHRSQL"
        }
      },
      {
        "value": "BMT-Status - # eval",
        "description": "A dataset containing patient notes with associated questions and answers related to bone marrow transplantation.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "BMT-Status"
        }
      },
      {
        "value": "BMT-Status - # train",
        "description": "A dataset containing patient notes with associated questions and answers related to bone marrow transplantation.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "BMT-Status"
        }
      },
      {
        "value": "BMT-Status - truncated",
        "description": "A dataset containing patient notes with associated questions and answers related to bone marrow transplantation.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "BMT-Status"
        }
      },
      {
        "value": "BMT-Status - # prompt tokens",
        "description": "A dataset containing patient notes with associated questions and answers related to bone marrow transplantation.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "BMT-Status"
        }
      },
      {
        "value": "BMT-Status - # output tokens",
        "description": "A dataset containing patient notes with associated questions and answers related to bone marrow transplantation.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "BMT-Status"
        }
      },
      {
        "value": "RaceBias - # eval",
        "description": "A collection of LLM outputs in response to medical questions with race-based biases, with the objective being to classify whether the output contains racially biased content.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "RaceBias"
        }
      },
      {
        "value": "RaceBias - # train",
        "description": "A collection of LLM outputs in response to medical questions with race-based biases, with the objective being to classify whether the output contains racially biased content.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "RaceBias"
        }
      },
      {
        "value": "RaceBias - truncated",
        "description": "A collection of LLM outputs in response to medical questions with race-based biases, with the objective being to classify whether the output contains racially biased content.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "RaceBias"
        }
      },
      {
        "value": "RaceBias - # prompt tokens",
        "description": "A collection of LLM outputs in response to medical questions with race-based biases, with the objective being to classify whether the output contains racially biased content.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "RaceBias"
        }
      },
      {
        "value": "RaceBias - # output tokens",
        "description": "A collection of LLM outputs in response to medical questions with race-based biases, with the objective being to classify whether the output contains racially biased content.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "RaceBias"
        }
      },
      {
        "value": "N2C2-CT - # eval",
        "description": "A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "N2C2-CT"
        }
      },
      {
        "value": "N2C2-CT - # train",
        "description": "A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "N2C2-CT"
        }
      },
      {
        "value": "N2C2-CT - truncated",
        "description": "A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "N2C2-CT"
        }
      },
      {
        "value": "N2C2-CT - # prompt tokens",
        "description": "A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "N2C2-CT"
        }
      },
      {
        "value": "N2C2-CT - # output tokens",
        "description": "A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "N2C2-CT"
        }
      },
      {
        "value": "MedHallu - # eval",
        "description": "A dataset of PubMed articles and associated questions, with the objective being to classify whether the answer is factual or hallucinated.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MedHallu"
        }
      },
      {
        "value": "MedHallu - # train",
        "description": "A dataset of PubMed articles and associated questions, with the objective being to classify whether the answer is factual or hallucinated.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MedHallu"
        }
      },
      {
        "value": "MedHallu - truncated",
        "description": "A dataset of PubMed articles and associated questions, with the objective being to classify whether the answer is factual or hallucinated.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MedHallu"
        }
      },
      {
        "value": "MedHallu - # prompt tokens",
        "description": "A dataset of PubMed articles and associated questions, with the objective being to classify whether the answer is factual or hallucinated.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MedHallu"
        }
      },
      {
        "value": "MedHallu - # output tokens",
        "description": "A dataset of PubMed articles and associated questions, with the objective being to classify whether the answer is factual or hallucinated.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MedHallu"
        }
      },
      {
        "value": "HospiceReferral - # eval",
        "description": "A dataset evaluating performance in identifying appropriate patient referrals to hospice care.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "HospiceReferral"
        }
      },
      {
        "value": "HospiceReferral - # train",
        "description": "A dataset evaluating performance in identifying appropriate patient referrals to hospice care.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "HospiceReferral"
        }
      },
      {
        "value": "HospiceReferral - truncated",
        "description": "A dataset evaluating performance in identifying appropriate patient referrals to hospice care.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "HospiceReferral"
        }
      },
      {
        "value": "HospiceReferral - # prompt tokens",
        "description": "A dataset evaluating performance in identifying appropriate patient referrals to hospice care.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "HospiceReferral"
        }
      },
      {
        "value": "HospiceReferral - # output tokens",
        "description": "A dataset evaluating performance in identifying appropriate patient referrals to hospice care.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "HospiceReferral"
        }
      },
      {
        "value": "MIMIC-IV Billing Code - # eval",
        "description": "A  dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MIMIC-IV Billing Code"
        }
      },
      {
        "value": "MIMIC-IV Billing Code - # train",
        "description": "A  dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MIMIC-IV Billing Code"
        }
      },
      {
        "value": "MIMIC-IV Billing Code - truncated",
        "description": "A  dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MIMIC-IV Billing Code"
        }
      },
      {
        "value": "MIMIC-IV Billing Code - # prompt tokens",
        "description": "A  dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MIMIC-IV Billing Code"
        }
      },
      {
        "value": "MIMIC-IV Billing Code - # output tokens",
        "description": "A  dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MIMIC-IV Billing Code"
        }
      },
      {
        "value": "ClinicReferral - # eval",
        "description": "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "ClinicReferral"
        }
      },
      {
        "value": "ClinicReferral - # train",
        "description": "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "ClinicReferral"
        }
      },
      {
        "value": "ClinicReferral - truncated",
        "description": "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "ClinicReferral"
        }
      },
      {
        "value": "ClinicReferral - # prompt tokens",
        "description": "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "ClinicReferral"
        }
      },
      {
        "value": "ClinicReferral - # output tokens",
        "description": "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "ClinicReferral"
        }
      },
      {
        "value": "CDI-QA - # eval",
        "description": "A dataset built from Clinical Document Integrity (CDI) notes, to assess the ability to answer verification questions from previous notes.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "CDI-QA"
        }
      },
      {
        "value": "CDI-QA - # train",
        "description": "A dataset built from Clinical Document Integrity (CDI) notes, to assess the ability to answer verification questions from previous notes.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "CDI-QA"
        }
      },
      {
        "value": "CDI-QA - truncated",
        "description": "A dataset built from Clinical Document Integrity (CDI) notes, to assess the ability to answer verification questions from previous notes.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "CDI-QA"
        }
      },
      {
        "value": "CDI-QA - # prompt tokens",
        "description": "A dataset built from Clinical Document Integrity (CDI) notes, to assess the ability to answer verification questions from previous notes.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "CDI-QA"
        }
      },
      {
        "value": "CDI-QA - # output tokens",
        "description": "A dataset built from Clinical Document Integrity (CDI) notes, to assess the ability to answer verification questions from previous notes.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "CDI-QA"
        }
      },
      {
        "value": "ENT-Referral - # eval",
        "description": "A dataset designed to evaluate performance in identifying appropriate patient referrals to Ear, Nose, and Throat specialists.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "ENT-Referral"
        }
      },
      {
        "value": "ENT-Referral - # train",
        "description": "A dataset designed to evaluate performance in identifying appropriate patient referrals to Ear, Nose, and Throat specialists.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "ENT-Referral"
        }
      },
      {
        "value": "ENT-Referral - truncated",
        "description": "A dataset designed to evaluate performance in identifying appropriate patient referrals to Ear, Nose, and Throat specialists.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "ENT-Referral"
        }
      },
      {
        "value": "ENT-Referral - # prompt tokens",
        "description": "A dataset designed to evaluate performance in identifying appropriate patient referrals to Ear, Nose, and Throat specialists.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "ENT-Referral"
        }
      },
      {
        "value": "ENT-Referral - # output tokens",
        "description": "A dataset designed to evaluate performance in identifying appropriate patient referrals to Ear, Nose, and Throat specialists.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "ENT-Referral"
        }
      }
    ],
    "rows": [
      [
        {
          "value": "Claude 3.5 Sonnet (20241022)",
          "description": "",
          "markdown": false
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 579.799,
          "description": "min=579.799, mean=579.799, max=579.799, sum=579.799 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 2.397,
          "description": "min=2.397, mean=2.397, max=2.397, sum=2.397 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 78.61538461538461,
          "description": "min=28, mean=78.615, max=100, sum=1022 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 480.8413416196609,
          "description": "min=446.011, mean=480.841, max=525.657, sum=6250.937 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 427.0,
          "description": "min=427, mean=427, max=427, sum=427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 794.0585480093677,
          "description": "min=794.059, mean=794.059, max=794.059, sum=794.059 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 597.0,
          "description": "min=597, mean=597, max=597, sum=597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 320.5175879396985,
          "description": "min=320.518, mean=320.518, max=320.518, sum=320.518 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 115.89614740368509,
          "description": "min=115.896, mean=115.896, max=115.896, sum=115.896 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=3000 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 33185.73533333334,
          "description": "min=29931.775, mean=33185.735, max=35478.742, sum=99557.206 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 163.022,
          "description": "min=163.022, mean=163.022, max=163.022, sum=163.022 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 308.0,
          "description": "min=308, mean=308, max=308, sum=308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 345.6266233766234,
          "description": "min=345.627, mean=345.627, max=345.627, sum=345.627 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 149.0,
          "description": "min=149, mean=149, max=149, sum=149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 55740.29530201342,
          "description": "min=55740.295, mean=55740.295, max=55740.295, sum=55740.295 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 423.0,
          "description": "min=423, mean=423, max=423, sum=423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 292.11820330969266,
          "description": "min=292.118, mean=292.118, max=292.118, sum=292.118 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 915.0,
          "description": "min=915, mean=915, max=915, sum=915 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 777.151912568306,
          "description": "min=777.152, mean=777.152, max=777.152, sum=777.152 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 3668.27,
          "description": "min=3668.27, mean=3668.27, max=3668.27, sum=3668.27 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 120.0,
          "description": "min=120, mean=120, max=120, sum=120 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1674.0583333333334,
          "description": "min=1674.058, mean=1674.058, max=1674.058, sum=1674.058 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 128.0,
          "description": "min=128, mean=128, max=128, sum=128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 309.640625,
          "description": "min=309.641, mean=309.641, max=309.641, sum=309.641 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 212.638,
          "description": "min=212.638, mean=212.638, max=212.638, sum=212.638 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 653.94,
          "description": "min=653.94, mean=653.94, max=653.94, sum=653.94 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 487.0,
          "description": "min=487, mean=487, max=487, sum=487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 779.6509240246406,
          "description": "min=779.651, mean=779.651, max=779.651, sum=779.651 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 689.0,
          "description": "min=689, mean=689, max=689, sum=689 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 25.08708272859216,
          "description": "min=25.087, mean=25.087, max=25.087, sum=25.087 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 361.0,
          "description": "min=361, mean=361, max=361, sum=361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 2571.130193905817,
          "description": "min=2571.13, mean=2571.13, max=2571.13, sum=2571.13 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=2000 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 267.246,
          "description": "min=244.906, mean=267.246, max=289.586, sum=534.492 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 2062.048,
          "description": "min=2062.048, mean=2062.048, max=2062.048, sum=2062.048 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 150.0,
          "description": "min=150, mean=150, max=150, sum=150 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 40.62,
          "description": "min=40.62, mean=40.62, max=40.62, sum=40.62 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 67.0,
          "description": "min=67, mean=67, max=67, sum=67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 703.9402985074627,
          "description": "min=703.94, mean=703.94, max=703.94, sum=703.94 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 188.42333333333335,
          "description": "min=188.423, mean=188.423, max=188.423, sum=188.423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 265.67,
          "description": "min=265.67, mean=265.67, max=265.67, sum=265.67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 375.302,
          "description": "min=375.302, mean=375.302, max=375.302, sum=375.302 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1126.031,
          "description": "min=1126.031, mean=1126.031, max=1126.031, sum=1126.031 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 79.787,
          "description": "min=79.787, mean=79.787, max=79.787, sum=79.787 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 220.0,
          "description": "min=220, mean=220, max=220, sum=220 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 2041.9727272727273,
          "description": "min=2041.973, mean=2041.973, max=2041.973, sum=2041.973 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 167.0,
          "description": "min=167, mean=167, max=167, sum=167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 490.29940119760477,
          "description": "min=490.299, mean=490.299, max=490.299, sum=490.299 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 86.0,
          "description": "min=86, mean=86, max=86, sum=258 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 5753.391472868217,
          "description": "min=5702.058, mean=5753.391, max=5830.058, sum=17260.174 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 715.166,
          "description": "min=715.166, mean=715.166, max=715.166, sum=715.166 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 22974.077,
          "description": "min=22974.077, mean=22974.077, max=22974.077, sum=22974.077 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 327.0,
          "description": "min=327, mean=327, max=327, sum=327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 40203.091743119265,
          "description": "min=40203.092, mean=40203.092, max=40203.092, sum=40203.092 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 63153.442,
          "description": "min=63153.442, mean=63153.442, max=63153.442, sum=63153.442 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 15388.833,
          "description": "min=15388.833, mean=15388.833, max=15388.833, sum=15388.833 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        }
      ],
      [
        {
          "value": "Claude 3.7 Sonnet (20250219)",
          "description": "",
          "markdown": false
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 579.799,
          "description": "min=579.799, mean=579.799, max=579.799, sum=579.799 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 2.091,
          "description": "min=2.091, mean=2.091, max=2.091, sum=2.091 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 78.61538461538461,
          "description": "min=28, mean=78.615, max=100, sum=1022 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 480.8413416196609,
          "description": "min=446.011, mean=480.841, max=525.657, sum=6250.937 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 427.0,
          "description": "min=427, mean=427, max=427, sum=427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 794.0585480093677,
          "description": "min=794.059, mean=794.059, max=794.059, sum=794.059 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 597.0,
          "description": "min=597, mean=597, max=597, sum=597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 320.5175879396985,
          "description": "min=320.518, mean=320.518, max=320.518, sum=320.518 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 17.23785594639866,
          "description": "min=17.238, mean=17.238, max=17.238, sum=17.238 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=3000 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 33185.73533333334,
          "description": "min=29931.775, mean=33185.735, max=35478.742, sum=99557.206 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 163.022,
          "description": "min=163.022, mean=163.022, max=163.022, sum=163.022 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 308.0,
          "description": "min=308, mean=308, max=308, sum=308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 345.6266233766234,
          "description": "min=345.627, mean=345.627, max=345.627, sum=345.627 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 149.0,
          "description": "min=149, mean=149, max=149, sum=149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 55740.29530201342,
          "description": "min=55740.295, mean=55740.295, max=55740.295, sum=55740.295 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 423.0,
          "description": "min=423, mean=423, max=423, sum=423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 292.11820330969266,
          "description": "min=292.118, mean=292.118, max=292.118, sum=292.118 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 915.0,
          "description": "min=915, mean=915, max=915, sum=915 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 777.151912568306,
          "description": "min=777.152, mean=777.152, max=777.152, sum=777.152 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 3668.27,
          "description": "min=3668.27, mean=3668.27, max=3668.27, sum=3668.27 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 120.0,
          "description": "min=120, mean=120, max=120, sum=120 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1674.0583333333334,
          "description": "min=1674.058, mean=1674.058, max=1674.058, sum=1674.058 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 128.0,
          "description": "min=128, mean=128, max=128, sum=128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 309.640625,
          "description": "min=309.641, mean=309.641, max=309.641, sum=309.641 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 212.638,
          "description": "min=212.638, mean=212.638, max=212.638, sum=212.638 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 653.94,
          "description": "min=653.94, mean=653.94, max=653.94, sum=653.94 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 487.0,
          "description": "min=487, mean=487, max=487, sum=487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 779.6509240246406,
          "description": "min=779.651, mean=779.651, max=779.651, sum=779.651 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 689.0,
          "description": "min=689, mean=689, max=689, sum=689 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 25.08708272859216,
          "description": "min=25.087, mean=25.087, max=25.087, sum=25.087 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 361.0,
          "description": "min=361, mean=361, max=361, sum=361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 2571.130193905817,
          "description": "min=2571.13, mean=2571.13, max=2571.13, sum=2571.13 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=2000 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 267.246,
          "description": "min=244.906, mean=267.246, max=289.586, sum=534.492 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 2062.048,
          "description": "min=2062.048, mean=2062.048, max=2062.048, sum=2062.048 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 150.0,
          "description": "min=150, mean=150, max=150, sum=150 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 40.62,
          "description": "min=40.62, mean=40.62, max=40.62, sum=40.62 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 67.0,
          "description": "min=67, mean=67, max=67, sum=67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 703.9402985074627,
          "description": "min=703.94, mean=703.94, max=703.94, sum=703.94 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 188.42333333333335,
          "description": "min=188.423, mean=188.423, max=188.423, sum=188.423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 265.67,
          "description": "min=265.67, mean=265.67, max=265.67, sum=265.67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 375.302,
          "description": "min=375.302, mean=375.302, max=375.302, sum=375.302 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1126.031,
          "description": "min=1126.031, mean=1126.031, max=1126.031, sum=1126.031 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 81.993,
          "description": "min=81.993, mean=81.993, max=81.993, sum=81.993 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 220.0,
          "description": "min=220, mean=220, max=220, sum=220 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 2041.9727272727273,
          "description": "min=2041.973, mean=2041.973, max=2041.973, sum=2041.973 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 167.0,
          "description": "min=167, mean=167, max=167, sum=167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 490.29940119760477,
          "description": "min=490.299, mean=490.299, max=490.299, sum=490.299 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 86.0,
          "description": "min=86, mean=86, max=86, sum=258 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 5753.391472868217,
          "description": "min=5702.058, mean=5753.391, max=5830.058, sum=17260.174 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 715.166,
          "description": "min=715.166, mean=715.166, max=715.166, sum=715.166 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 22974.077,
          "description": "min=22974.077, mean=22974.077, max=22974.077, sum=22974.077 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 327.0,
          "description": "min=327, mean=327, max=327, sum=327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 40203.091743119265,
          "description": "min=40203.092, mean=40203.092, max=40203.092, sum=40203.092 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 63153.442,
          "description": "min=63153.442, mean=63153.442, max=63153.442, sum=63153.442 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 15388.833,
          "description": "min=15388.833, mean=15388.833, max=15388.833, sum=15388.833 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        }
      ],
      [
        {
          "value": "DeepSeek R1",
          "description": "",
          "markdown": false
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 551.78,
          "description": "min=551.78, mean=551.78, max=551.78, sum=551.78 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 78.61538461538461,
          "description": "min=28, mean=78.615, max=100, sum=1022 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 478.6682976038255,
          "description": "min=441.728, mean=478.668, max=512.09, sum=6222.688 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.9976254180602008,
          "description": "min=0.989, mean=0.998, max=1, sum=12.969 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 427.0,
          "description": "min=427, mean=427, max=427, sum=427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 756.0468384074942,
          "description": "min=756.047, mean=756.047, max=756.047, sum=756.047 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 596.5573770491803,
          "description": "min=596.557, mean=596.557, max=596.557, sum=596.557 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 597.0,
          "description": "min=597, mean=597, max=597, sum=597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 304.89447236180905,
          "description": "min=304.894, mean=304.894, max=304.894, sum=304.894 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=3000 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 32781.75400000001,
          "description": "min=29560.991, mean=32781.754, max=35063.189, sum=98345.262 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.975,
          "description": "min=0.952, mean=0.975, max=0.994, sum=2.925 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 146.889,
          "description": "min=146.889, mean=146.889, max=146.889, sum=146.889 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 308.0,
          "description": "min=308, mean=308, max=308, sum=308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 330.1655844155844,
          "description": "min=330.166, mean=330.166, max=330.166, sum=330.166 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 149.0,
          "description": "min=149, mean=149, max=149, sum=149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 55318.20134228188,
          "description": "min=55318.201, mean=55318.201, max=55318.201, sum=55318.201 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 276.1275167785235,
          "description": "min=276.128, mean=276.128, max=276.128, sum=276.128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 423.0,
          "description": "min=423, mean=423, max=423, sum=423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 279.1985815602837,
          "description": "min=279.199, mean=279.199, max=279.199, sum=279.199 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 915.0,
          "description": "min=915, mean=915, max=915, sum=915 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 730.0043715846995,
          "description": "min=730.004, mean=730.004, max=730.004, sum=730.004 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.9989071038251366,
          "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 3383.91,
          "description": "min=3383.91, mean=3383.91, max=3383.91, sum=3383.91 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 472.373,
          "description": "min=472.373, mean=472.373, max=472.373, sum=472.373 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 120.0,
          "description": "min=120, mean=120, max=120, sum=120 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1613.1666666666667,
          "description": "min=1613.167, mean=1613.167, max=1613.167, sum=1613.167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 465.925,
          "description": "min=465.925, mean=465.925, max=465.925, sum=465.925 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 128.0,
          "description": "min=128, mean=128, max=128, sum=128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 289.765625,
          "description": "min=289.766, mean=289.766, max=289.766, sum=289.766 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 590.8515625,
          "description": "min=590.852, mean=590.852, max=590.852, sum=590.852 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 199.291,
          "description": "min=199.291, mean=199.291, max=199.291, sum=199.291 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 94.074,
          "description": "min=94.074, mean=94.074, max=94.074, sum=94.074 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 635.72,
          "description": "min=635.72, mean=635.72, max=635.72, sum=635.72 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 149.09,
          "description": "min=149.09, mean=149.09, max=149.09, sum=149.09 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 487.0,
          "description": "min=487, mean=487, max=487, sum=487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 721.3819301848049,
          "description": "min=721.382, mean=721.382, max=721.382, sum=721.382 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 127.06570841889118,
          "description": "min=127.066, mean=127.066, max=127.066, sum=127.066 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 689.0,
          "description": "min=689, mean=689, max=689, sum=689 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 22.378809869375907,
          "description": "min=22.379, mean=22.379, max=22.379, sum=22.379 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 285.86066763425254,
          "description": "min=285.861, mean=285.861, max=285.861, sum=285.861 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 361.0,
          "description": "min=361, mean=361, max=361, sum=361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 2541.786703601108,
          "description": "min=2541.787, mean=2541.787, max=2541.787, sum=2541.787 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 699.0747922437673,
          "description": "min=699.075, mean=699.075, max=699.075, sum=699.075 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=2000 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 260.207,
          "description": "min=239.027, mean=260.207, max=281.387, sum=520.414 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 58.4075,
          "description": "min=57.567, mean=58.407, max=59.248, sum=116.815 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 2017.948,
          "description": "min=2017.948, mean=2017.948, max=2017.948, sum=2017.948 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 150.0,
          "description": "min=150, mean=150, max=150, sum=150 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 37.50666666666667,
          "description": "min=37.507, mean=37.507, max=37.507, sum=37.507 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 421.92,
          "description": "min=421.92, mean=421.92, max=421.92, sum=421.92 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 67.0,
          "description": "min=67, mean=67, max=67, sum=67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 687.7014925373135,
          "description": "min=687.701, mean=687.701, max=687.701, sum=687.701 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 240.29850746268656,
          "description": "min=240.299, mean=240.299, max=240.299, sum=240.299 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 183.22666666666666,
          "description": "min=183.227, mean=183.227, max=183.227, sum=183.227 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 256.9033333333333,
          "description": "min=256.903, mean=256.903, max=256.903, sum=256.903 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 364.658,
          "description": "min=364.658, mean=364.658, max=364.658, sum=364.658 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1002.984,
          "description": "min=1002.984, mean=1002.984, max=1002.984, sum=1002.984 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 220.0,
          "description": "min=220, mean=220, max=220, sum=220 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 2042.3363636363636,
          "description": "min=2042.336, mean=2042.336, max=2042.336, sum=2042.336 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1.0363636363636364,
          "description": "min=1.036, mean=1.036, max=1.036, sum=1.036 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 167.0,
          "description": "min=167, mean=167, max=167, sum=167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 466.94610778443115,
          "description": "min=466.946, mean=466.946, max=466.946, sum=466.946 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 86.0,
          "description": "min=86, mean=86, max=86, sum=258 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 5437.860465116279,
          "description": "min=5387.86, mean=5437.86, max=5515.86, sum=16313.581 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.9922480620155039,
          "description": "min=0.977, mean=0.992, max=1, sum=2.977 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 694.905,
          "description": "min=694.905, mean=694.905, max=694.905, sum=694.905 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 22653.558,
          "description": "min=22653.558, mean=22653.558, max=22653.558, sum=22653.558 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1.003,
          "description": "min=1.003, mean=1.003, max=1.003, sum=1.003 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 327.0,
          "description": "min=327, mean=327, max=327, sum=327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 39468.477064220184,
          "description": "min=39468.477, mean=39468.477, max=39468.477, sum=39468.477 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1.003058103975535,
          "description": "min=1.003, mean=1.003, max=1.003, sum=1.003 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 62288.604,
          "description": "min=62288.604, mean=62288.604, max=62288.604, sum=62288.604 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1.017,
          "description": "min=1.017, mean=1.017, max=1.017, sum=1.017 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 15157.31,
          "description": "min=15157.31, mean=15157.31, max=15157.31, sum=15157.31 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1.062,
          "description": "min=1.062, mean=1.062, max=1.062, sum=1.062 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        }
      ],
      [
        {
          "value": "Gemini 1.5 Pro (001)",
          "description": "",
          "markdown": false
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 586.742,
          "description": "min=586.742, mean=586.742, max=586.742, sum=586.742 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 78.61538461538461,
          "description": "min=28, mean=78.615, max=100, sum=1022 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 500.5226477055633,
          "description": "min=457.902, mean=500.523, max=535.478, sum=6506.794 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 427.0,
          "description": "min=427, mean=427, max=427, sum=427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 759.2295081967213,
          "description": "min=759.23, mean=759.23, max=759.23, sum=759.23 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 597.0,
          "description": "min=597, mean=597, max=597, sum=597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 327.09212730318256,
          "description": "min=327.092, mean=327.092, max=327.092, sum=327.092 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=3000 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 43286.37100000001,
          "description": "min=38964.654, mean=43286.371, max=46266.454, sum=129859.113 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 153.899,
          "description": "min=153.899, mean=153.899, max=153.899, sum=153.899 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 308.0,
          "description": "min=308, mean=308, max=308, sum=308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 347.45454545454544,
          "description": "min=347.455, mean=347.455, max=347.455, sum=347.455 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 149.0,
          "description": "min=149, mean=149, max=149, sum=149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 60152.4899328859,
          "description": "min=60152.49, mean=60152.49, max=60152.49, sum=60152.49 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 423.0,
          "description": "min=423, mean=423, max=423, sum=423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 277.21513002364065,
          "description": "min=277.215, mean=277.215, max=277.215, sum=277.215 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 915.0,
          "description": "min=915, mean=915, max=915, sum=915 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 740.8524590163935,
          "description": "min=740.852, mean=740.852, max=740.852, sum=740.852 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3593.987,
          "description": "min=3593.987, mean=3593.987, max=3593.987, sum=3593.987 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 120.0,
          "description": "min=120, mean=120, max=120, sum=120 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1677.5333333333333,
          "description": "min=1677.533, mean=1677.533, max=1677.533, sum=1677.533 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 128.0,
          "description": "min=128, mean=128, max=128, sum=128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 289.015625,
          "description": "min=289.016, mean=289.016, max=289.016, sum=289.016 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 197.6,
          "description": "min=197.6, mean=197.6, max=197.6, sum=197.6 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 615.41,
          "description": "min=615.41, mean=615.41, max=615.41, sum=615.41 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 487.0,
          "description": "min=487, mean=487, max=487, sum=487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 736.747433264887,
          "description": "min=736.747, mean=736.747, max=736.747, sum=736.747 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 689.0,
          "description": "min=689, mean=689, max=689, sum=689 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 23.423802612481857,
          "description": "min=23.424, mean=23.424, max=23.424, sum=23.424 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 361.0,
          "description": "min=361, mean=361, max=361, sum=361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2606.409972299169,
          "description": "min=2606.41, mean=2606.41, max=2606.41, sum=2606.41 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=2000 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "med_dialog,subset=icliniq:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "med_dialog,subset=icliniq:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "med_dialog,subset=icliniq:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 261.433,
          "description": "min=238.927, mean=261.433, max=283.939, sum=522.866 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "med_dialog,subset=icliniq:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "med_dialog,subset=icliniq:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2087.742,
          "description": "min=2087.742, mean=2087.742, max=2087.742, sum=2087.742 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 150.0,
          "description": "min=150, mean=150, max=150, sum=150 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 38.906666666666666,
          "description": "min=38.907, mean=38.907, max=38.907, sum=38.907 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 67.0,
          "description": "min=67, mean=67, max=67, sum=67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 703.044776119403,
          "description": "min=703.045, mean=703.045, max=703.045, sum=703.045 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 187.90666666666667,
          "description": "min=187.907, mean=187.907, max=187.907, sum=187.907 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 265.4066666666667,
          "description": "min=265.407, mean=265.407, max=265.407, sum=265.407 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 383.966,
          "description": "min=383.966, mean=383.966, max=383.966, sum=383.966 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1112.729,
          "description": "min=1112.729, mean=1112.729, max=1112.729, sum=1112.729 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 220.0,
          "description": "min=220, mean=220, max=220, sum=220 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2155.181818181818,
          "description": "min=2155.182, mean=2155.182, max=2155.182, sum=2155.182 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 167.0,
          "description": "min=167, mean=167, max=167, sum=167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 490.7544910179641,
          "description": "min=490.754, mean=490.754, max=490.754, sum=490.754 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 86.0,
          "description": "min=86, mean=86, max=86, sum=258 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 5833.290697674419,
          "description": "min=5783.291, mean=5833.291, max=5908.291, sum=17499.872 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 718.045,
          "description": "min=718.045, mean=718.045, max=718.045, sum=718.045 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 23792.148,
          "description": "min=23792.148, mean=23792.148, max=23792.148, sum=23792.148 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 327.0,
          "description": "min=327, mean=327, max=327, sum=327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 41037.605504587154,
          "description": "min=41037.606, mean=41037.606, max=41037.606, sum=41037.606 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 65242.146,
          "description": "min=65242.146, mean=65242.146, max=65242.146, sum=65242.146 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 15723.329,
          "description": "min=15723.329, mean=15723.329, max=15723.329, sum=15723.329 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        }
      ],
      [
        {
          "value": "Gemini 2.0 Flash",
          "description": "",
          "markdown": false
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 586.742,
          "description": "min=586.742, mean=586.742, max=586.742, sum=586.742 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 78.61538461538461,
          "description": "min=28, mean=78.615, max=100, sum=1022 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 500.5226477055633,
          "description": "min=457.902, mean=500.523, max=535.478, sum=6506.794 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 427.0,
          "description": "min=427, mean=427, max=427, sum=427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 759.2295081967213,
          "description": "min=759.23, mean=759.23, max=759.23, sum=759.23 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 597.0,
          "description": "min=597, mean=597, max=597, sum=597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 327.09212730318256,
          "description": "min=327.092, mean=327.092, max=327.092, sum=327.092 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=3000 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 43286.37100000001,
          "description": "min=38964.654, mean=43286.371, max=46266.454, sum=129859.113 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 153.899,
          "description": "min=153.899, mean=153.899, max=153.899, sum=153.899 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 308.0,
          "description": "min=308, mean=308, max=308, sum=308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 347.45454545454544,
          "description": "min=347.455, mean=347.455, max=347.455, sum=347.455 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 149.0,
          "description": "min=149, mean=149, max=149, sum=149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 60152.4899328859,
          "description": "min=60152.49, mean=60152.49, max=60152.49, sum=60152.49 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 423.0,
          "description": "min=423, mean=423, max=423, sum=423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 278.21513002364065,
          "description": "min=278.215, mean=278.215, max=278.215, sum=278.215 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 915.0,
          "description": "min=915, mean=915, max=915, sum=915 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 741.8524590163935,
          "description": "min=741.852, mean=741.852, max=741.852, sum=741.852 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3593.987,
          "description": "min=3593.987, mean=3593.987, max=3593.987, sum=3593.987 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 120.0,
          "description": "min=120, mean=120, max=120, sum=120 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1677.5333333333333,
          "description": "min=1677.533, mean=1677.533, max=1677.533, sum=1677.533 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 128.0,
          "description": "min=128, mean=128, max=128, sum=128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 289.015625,
          "description": "min=289.016, mean=289.016, max=289.016, sum=289.016 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 197.6,
          "description": "min=197.6, mean=197.6, max=197.6, sum=197.6 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 615.41,
          "description": "min=615.41, mean=615.41, max=615.41, sum=615.41 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 487.0,
          "description": "min=487, mean=487, max=487, sum=487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 736.747433264887,
          "description": "min=736.747, mean=736.747, max=736.747, sum=736.747 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 689.0,
          "description": "min=689, mean=689, max=689, sum=689 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 23.423802612481857,
          "description": "min=23.424, mean=23.424, max=23.424, sum=23.424 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 361.0,
          "description": "min=361, mean=361, max=361, sum=361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2606.409972299169,
          "description": "min=2606.41, mean=2606.41, max=2606.41, sum=2606.41 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=2000 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "med_dialog,subset=icliniq:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "med_dialog,subset=icliniq:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "med_dialog,subset=icliniq:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 261.433,
          "description": "min=238.927, mean=261.433, max=283.939, sum=522.866 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "med_dialog,subset=icliniq:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "med_dialog,subset=icliniq:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2089.742,
          "description": "min=2089.742, mean=2089.742, max=2089.742, sum=2089.742 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 150.0,
          "description": "min=150, mean=150, max=150, sum=150 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 38.906666666666666,
          "description": "min=38.907, mean=38.907, max=38.907, sum=38.907 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 67.0,
          "description": "min=67, mean=67, max=67, sum=67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 703.044776119403,
          "description": "min=703.045, mean=703.045, max=703.045, sum=703.045 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 187.90666666666667,
          "description": "min=187.907, mean=187.907, max=187.907, sum=187.907 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 265.4066666666667,
          "description": "min=265.407, mean=265.407, max=265.407, sum=265.407 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 383.966,
          "description": "min=383.966, mean=383.966, max=383.966, sum=383.966 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1112.729,
          "description": "min=1112.729, mean=1112.729, max=1112.729, sum=1112.729 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 220.0,
          "description": "min=220, mean=220, max=220, sum=220 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2157.181818181818,
          "description": "min=2157.182, mean=2157.182, max=2157.182, sum=2157.182 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 167.0,
          "description": "min=167, mean=167, max=167, sum=167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 490.7544910179641,
          "description": "min=490.754, mean=490.754, max=490.754, sum=490.754 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 86.0,
          "description": "min=86, mean=86, max=86, sum=258 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 5833.290697674419,
          "description": "min=5783.291, mean=5833.291, max=5908.291, sum=17499.872 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 718.045,
          "description": "min=718.045, mean=718.045, max=718.045, sum=718.045 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 23794.148,
          "description": "min=23794.148, mean=23794.148, max=23794.148, sum=23794.148 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 327.0,
          "description": "min=327, mean=327, max=327, sum=327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 41039.605504587154,
          "description": "min=41039.606, mean=41039.606, max=41039.606, sum=41039.606 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 65244.146,
          "description": "min=65244.146, mean=65244.146, max=65244.146, sum=65244.146 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 15725.329,
          "description": "min=15725.329, mean=15725.329, max=15725.329, sum=15725.329 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        }
      ],
      [
        {
          "value": "GPT-4o (2024-05-13)",
          "description": "",
          "markdown": false
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 570.684,
          "description": "min=570.684, mean=570.684, max=570.684, sum=570.684 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.904,
          "description": "min=1.904, mean=1.904, max=1.904, sum=1.904 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 78.61538461538461,
          "description": "min=28, mean=78.615, max=100, sum=1022 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 483.5058692520385,
          "description": "min=443.989, mean=483.506, max=519.567, sum=6285.576 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.9984615384615385,
          "description": "min=0.99, mean=0.998, max=1, sum=12.98 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 427.0,
          "description": "min=427, mean=427, max=427, sum=427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 771.903981264637,
          "description": "min=771.904, mean=771.904, max=771.904, sum=771.904 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 450.1217798594848,
          "description": "min=450.122, mean=450.122, max=450.122, sum=450.122 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 597.0,
          "description": "min=597, mean=597, max=597, sum=597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 308.2579564489112,
          "description": "min=308.258, mean=308.258, max=308.258, sum=308.258 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 19.77721943048576,
          "description": "min=19.777, mean=19.777, max=19.777, sum=19.777 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=3000 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 35676.975666666665,
          "description": "min=32182.049, mean=35676.976, max=38197.948, sum=107030.927 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=3 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 149.213,
          "description": "min=149.213, mean=149.213, max=149.213, sum=149.213 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 308.0,
          "description": "min=308, mean=308, max=308, sum=308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 332.262987012987,
          "description": "min=332.263, mean=332.263, max=332.263, sum=332.263 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 149.0,
          "description": "min=149, mean=149, max=149, sum=149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 55486.93288590604,
          "description": "min=55486.933, mean=55486.933, max=55486.933, sum=55486.933 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 154.68456375838926,
          "description": "min=154.685, mean=154.685, max=154.685, sum=154.685 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 423.0,
          "description": "min=423, mean=423, max=423, sum=423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 280.2434988179669,
          "description": "min=280.243, mean=280.243, max=280.243, sum=280.243 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 915.0,
          "description": "min=915, mean=915, max=915, sum=915 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 735.6131147540983,
          "description": "min=735.613, mean=735.613, max=735.613, sum=735.613 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 3390.381,
          "description": "min=3390.381, mean=3390.381, max=3390.381, sum=3390.381 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 299.734,
          "description": "min=299.734, mean=299.734, max=299.734, sum=299.734 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 120.0,
          "description": "min=120, mean=120, max=120, sum=120 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1573.6416666666667,
          "description": "min=1573.642, mean=1573.642, max=1573.642, sum=1573.642 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 434.18333333333334,
          "description": "min=434.183, mean=434.183, max=434.183, sum=434.183 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 128.0,
          "description": "min=128, mean=128, max=128, sum=128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 297.8984375,
          "description": "min=297.898, mean=297.898, max=297.898, sum=297.898 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 506.8984375,
          "description": "min=506.898, mean=506.898, max=506.898, sum=506.898 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 207.052,
          "description": "min=207.052, mean=207.052, max=207.052, sum=207.052 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 91.136,
          "description": "min=91.136, mean=91.136, max=91.136, sum=91.136 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 624.83,
          "description": "min=624.83, mean=624.83, max=624.83, sum=624.83 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 217.87,
          "description": "min=217.87, mean=217.87, max=217.87, sum=217.87 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 487.0,
          "description": "min=487, mean=487, max=487, sum=487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 707.9425051334703,
          "description": "min=707.943, mean=707.943, max=707.943, sum=707.943 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 211.07186858316223,
          "description": "min=211.072, mean=211.072, max=211.072, sum=211.072 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 689.0,
          "description": "min=689, mean=689, max=689, sum=689 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 22.355587808417997,
          "description": "min=22.356, mean=22.356, max=22.356, sum=22.356 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 202.63570391872278,
          "description": "min=202.636, mean=202.636, max=202.636, sum=202.636 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 361.0,
          "description": "min=361, mean=361, max=361, sum=361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 2567.2714681440443,
          "description": "min=2567.271, mean=2567.271, max=2567.271, sum=2567.271 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 256.0,
          "description": "min=256, mean=256, max=256, sum=256 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=2000 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 258.995,
          "description": "min=236.85, mean=258.995, max=281.14, sum=517.99 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 48.3285,
          "description": "min=47.717, mean=48.328, max=48.94, sum=96.657 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 2042.08,
          "description": "min=2042.08, mean=2042.08, max=2042.08, sum=2042.08 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 150.0,
          "description": "min=150, mean=150, max=150, sum=150 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 37.626666666666665,
          "description": "min=37.627, mean=37.627, max=37.627, sum=37.627 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 369.38,
          "description": "min=369.38, mean=369.38, max=369.38, sum=369.38 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 67.0,
          "description": "min=67, mean=67, max=67, sum=67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 669.8059701492538,
          "description": "min=669.806, mean=669.806, max=669.806, sum=669.806 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 94.14925373134328,
          "description": "min=94.149, mean=94.149, max=94.149, sum=94.149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 179.92,
          "description": "min=179.92, mean=179.92, max=179.92, sum=179.92 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 254.18666666666667,
          "description": "min=254.187, mean=254.187, max=254.187, sum=254.187 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 371.822,
          "description": "min=371.822, mean=371.822, max=371.822, sum=371.822 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 978.287,
          "description": "min=978.287, mean=978.287, max=978.287, sum=978.287 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 48.703,
          "description": "min=48.703, mean=48.703, max=48.703, sum=48.703 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 220.0,
          "description": "min=220, mean=220, max=220, sum=220 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 2071.9545454545455,
          "description": "min=2071.955, mean=2071.955, max=2071.955, sum=2071.955 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 167.0,
          "description": "min=167, mean=167, max=167, sum=167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 468.88023952095807,
          "description": "min=468.88, mean=468.88, max=468.88, sum=468.88 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 86.0,
          "description": "min=86, mean=86, max=86, sum=258 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 5477.837209302325,
          "description": "min=5428.837, mean=5477.837, max=5553.837, sum=16433.512 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=3 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 701.975,
          "description": "min=701.975, mean=701.975, max=701.975, sum=701.975 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 22903.633,
          "description": "min=22903.633, mean=22903.633, max=22903.633, sum=22903.633 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 327.0,
          "description": "min=327, mean=327, max=327, sum=327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 39603.12844036697,
          "description": "min=39603.128, mean=39603.128, max=39603.128, sum=39603.128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 62667.633,
          "description": "min=62667.633, mean=62667.633, max=62667.633, sum=62667.633 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 15195.841,
          "description": "min=15195.841, mean=15195.841, max=15195.841, sum=15195.841 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        }
      ],
      [
        {
          "value": "GPT-4o mini (2024-07-18)",
          "description": "",
          "markdown": false
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 570.684,
          "description": "min=570.684, mean=570.684, max=570.684, sum=570.684 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 2.071,
          "description": "min=2.071, mean=2.071, max=2.071, sum=2.071 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 78.61538461538461,
          "description": "min=28, mean=78.615, max=100, sum=1022 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 483.5058692520385,
          "description": "min=443.989, mean=483.506, max=519.567, sum=6285.576 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.9984615384615385,
          "description": "min=0.99, mean=0.998, max=1, sum=12.98 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 427.0,
          "description": "min=427, mean=427, max=427, sum=427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 771.903981264637,
          "description": "min=771.904, mean=771.904, max=771.904, sum=771.904 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 504.9695550351288,
          "description": "min=504.97, mean=504.97, max=504.97, sum=504.97 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 597.0,
          "description": "min=597, mean=597, max=597, sum=597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 308.2579564489112,
          "description": "min=308.258, mean=308.258, max=308.258, sum=308.258 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 25.284757118927974,
          "description": "min=25.285, mean=25.285, max=25.285, sum=25.285 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=3000 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 35676.975666666665,
          "description": "min=32182.049, mean=35676.976, max=38197.948, sum=107030.927 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=3 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 149.213,
          "description": "min=149.213, mean=149.213, max=149.213, sum=149.213 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 308.0,
          "description": "min=308, mean=308, max=308, sum=308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 332.262987012987,
          "description": "min=332.263, mean=332.263, max=332.263, sum=332.263 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 149.0,
          "description": "min=149, mean=149, max=149, sum=149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 55486.93288590604,
          "description": "min=55486.933, mean=55486.933, max=55486.933, sum=55486.933 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 160.46979865771812,
          "description": "min=160.47, mean=160.47, max=160.47, sum=160.47 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 423.0,
          "description": "min=423, mean=423, max=423, sum=423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 280.2434988179669,
          "description": "min=280.243, mean=280.243, max=280.243, sum=280.243 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 915.0,
          "description": "min=915, mean=915, max=915, sum=915 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 735.6131147540983,
          "description": "min=735.613, mean=735.613, max=735.613, sum=735.613 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 3390.381,
          "description": "min=3390.381, mean=3390.381, max=3390.381, sum=3390.381 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 295.109,
          "description": "min=295.109, mean=295.109, max=295.109, sum=295.109 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 120.0,
          "description": "min=120, mean=120, max=120, sum=120 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1573.6416666666667,
          "description": "min=1573.642, mean=1573.642, max=1573.642, sum=1573.642 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 427.8333333333333,
          "description": "min=427.833, mean=427.833, max=427.833, sum=427.833 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 128.0,
          "description": "min=128, mean=128, max=128, sum=128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 297.8984375,
          "description": "min=297.898, mean=297.898, max=297.898, sum=297.898 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 505.53125,
          "description": "min=505.531, mean=505.531, max=505.531, sum=505.531 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 207.052,
          "description": "min=207.052, mean=207.052, max=207.052, sum=207.052 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 90.395,
          "description": "min=90.395, mean=90.395, max=90.395, sum=90.395 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 624.83,
          "description": "min=624.83, mean=624.83, max=624.83, sum=624.83 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 139.81,
          "description": "min=139.81, mean=139.81, max=139.81, sum=139.81 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 487.0,
          "description": "min=487, mean=487, max=487, sum=487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 707.9425051334703,
          "description": "min=707.943, mean=707.943, max=707.943, sum=707.943 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 206.6447638603696,
          "description": "min=206.645, mean=206.645, max=206.645, sum=206.645 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 689.0,
          "description": "min=689, mean=689, max=689, sum=689 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 22.355587808417997,
          "description": "min=22.356, mean=22.356, max=22.356, sum=22.356 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 184.02031930333817,
          "description": "min=184.02, mean=184.02, max=184.02, sum=184.02 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 361.0,
          "description": "min=361, mean=361, max=361, sum=361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 2567.2714681440443,
          "description": "min=2567.271, mean=2567.271, max=2567.271, sum=2567.271 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 256.0,
          "description": "min=256, mean=256, max=256, sum=256 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=2000 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 258.995,
          "description": "min=236.85, mean=258.995, max=281.14, sum=517.99 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 44.437,
          "description": "min=43.741, mean=44.437, max=45.133, sum=88.874 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 2042.08,
          "description": "min=2042.08, mean=2042.08, max=2042.08, sum=2042.08 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 150.0,
          "description": "min=150, mean=150, max=150, sum=150 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 37.626666666666665,
          "description": "min=37.627, mean=37.627, max=37.627, sum=37.627 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 339.56666666666666,
          "description": "min=339.567, mean=339.567, max=339.567, sum=339.567 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 67.0,
          "description": "min=67, mean=67, max=67, sum=67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 669.8059701492538,
          "description": "min=669.806, mean=669.806, max=669.806, sum=669.806 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 179.92,
          "description": "min=179.92, mean=179.92, max=179.92, sum=179.92 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 254.18666666666667,
          "description": "min=254.187, mean=254.187, max=254.187, sum=254.187 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 371.822,
          "description": "min=371.822, mean=371.822, max=371.822, sum=371.822 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 978.287,
          "description": "min=978.287, mean=978.287, max=978.287, sum=978.287 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 66.542,
          "description": "min=66.542, mean=66.542, max=66.542, sum=66.542 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 220.0,
          "description": "min=220, mean=220, max=220, sum=220 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 2071.9545454545455,
          "description": "min=2071.955, mean=2071.955, max=2071.955, sum=2071.955 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 167.0,
          "description": "min=167, mean=167, max=167, sum=167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 468.88023952095807,
          "description": "min=468.88, mean=468.88, max=468.88, sum=468.88 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 86.0,
          "description": "min=86, mean=86, max=86, sum=258 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 5477.837209302325,
          "description": "min=5428.837, mean=5477.837, max=5553.837, sum=16433.512 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=3 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 701.975,
          "description": "min=701.975, mean=701.975, max=701.975, sum=701.975 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 22903.633,
          "description": "min=22903.633, mean=22903.633, max=22903.633, sum=22903.633 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 327.0,
          "description": "min=327, mean=327, max=327, sum=327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 39603.12844036697,
          "description": "min=39603.128, mean=39603.128, max=39603.128, sum=39603.128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 62667.633,
          "description": "min=62667.633, mean=62667.633, max=62667.633, sum=62667.633 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 15195.841,
          "description": "min=15195.841, mean=15195.841, max=15195.841, sum=15195.841 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.3 Instruct (70B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 584.571,
          "description": "min=584.571, mean=584.571, max=584.571, sum=584.571 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 4.27,
          "description": "min=4.27, mean=4.27, max=4.27, sum=4.27 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 78.61538461538461,
          "description": "min=28, mean=78.615, max=100, sum=1022 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=chronic_pain,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=homelessness,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=liver_disease,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=major_depression,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=personality_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=unemployment,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=chronic_pain,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=homelessness,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=liver_disease,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=major_depression,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=personality_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=unemployment,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=chronic_pain,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=homelessness,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=liver_disease,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=major_depression,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=personality_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=unemployment,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 489.01692165679975,
          "description": "min=447.424, mean=489.017, max=525.955, sum=6357.22 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=chronic_pain,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=homelessness,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=liver_disease,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=major_depression,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=personality_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=unemployment,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.9984615384615385,
          "description": "min=0.99, mean=0.998, max=1, sum=12.98 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=chronic_pain,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=homelessness,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=liver_disease,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=major_depression,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=personality_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=unemployment,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 427.0,
          "description": "min=427, mean=427, max=427, sum=427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 788.0538641686182,
          "description": "min=788.054, mean=788.054, max=788.054, sum=788.054 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 443.0234192037471,
          "description": "min=443.023, mean=443.023, max=443.023, sum=443.023 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 597.0,
          "description": "min=597, mean=597, max=597, sum=597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 311.5812395309883,
          "description": "min=311.581, mean=311.581, max=311.581, sum=311.581 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 171.64489112227807,
          "description": "min=171.645, mean=171.645, max=171.645, sum=171.645 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=3000 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=lab_anemia,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=new_hypertension,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=lab_anemia,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=new_hypertension,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=lab_anemia,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=new_hypertension,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 32868.43833333333,
          "description": "min=29647.98, mean=32868.438, max=35141.782, sum=98605.315 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=lab_anemia,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=new_hypertension,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=3 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=lab_anemia,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=new_hypertension,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 150.567,
          "description": "min=150.567, mean=150.567, max=150.567, sum=150.567 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 308.0,
          "description": "min=308, mean=308, max=308, sum=308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 337.39935064935065,
          "description": "min=337.399, mean=337.399, max=337.399, sum=337.399 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 149.0,
          "description": "min=149, mean=149, max=149, sum=149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 55932.718120805366,
          "description": "min=55932.718, mean=55932.718, max=55932.718, sum=55932.718 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 138.00671140939596,
          "description": "min=138.007, mean=138.007, max=138.007, sum=138.007 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 423.0,
          "description": "min=423, mean=423, max=423, sum=423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 283.0118203309693,
          "description": "min=283.012, mean=283.012, max=283.012, sum=283.012 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 915.0,
          "description": "min=915, mean=915, max=915, sum=915 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 746.248087431694,
          "description": "min=746.248, mean=746.248, max=746.248, sum=746.248 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 3471.207,
          "description": "min=3471.207, mean=3471.207, max=3471.207, sum=3471.207 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 295.329,
          "description": "min=295.329, mean=295.329, max=295.329, sum=295.329 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 120.0,
          "description": "min=120, mean=120, max=120, sum=120 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1629.5833333333333,
          "description": "min=1629.583, mean=1629.583, max=1629.583, sum=1629.583 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 428.6666666666667,
          "description": "min=428.667, mean=428.667, max=428.667, sum=428.667 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 128.0,
          "description": "min=128, mean=128, max=128, sum=128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 304.1640625,
          "description": "min=304.164, mean=304.164, max=304.164, sum=304.164 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 456.828125,
          "description": "min=456.828, mean=456.828, max=456.828, sum=456.828 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 210.318,
          "description": "min=210.318, mean=210.318, max=210.318, sum=210.318 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 54.236,
          "description": "min=54.236, mean=54.236, max=54.236, sum=54.236 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 643.16,
          "description": "min=643.16, mean=643.16, max=643.16, sum=643.16 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 118.91,
          "description": "min=118.91, mean=118.91, max=118.91, sum=118.91 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 487.0,
          "description": "min=487, mean=487, max=487, sum=487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 723.6078028747434,
          "description": "min=723.608, mean=723.608, max=723.608, sum=723.608 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 167.82340862422998,
          "description": "min=167.823, mean=167.823, max=167.823, sum=167.823 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 689.0,
          "description": "min=689, mean=689, max=689, sum=689 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 22.6966618287373,
          "description": "min=22.697, mean=22.697, max=22.697, sum=22.697 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 295.51959361393324,
          "description": "min=295.52, mean=295.52, max=295.52, sum=295.52 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 361.0,
          "description": "min=361, mean=361, max=361, sum=361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 2616.9639889196674,
          "description": "min=2616.964, mean=2616.964, max=2616.964, sum=2616.964 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 255.98060941828254,
          "description": "min=255.981, mean=255.981, max=255.981, sum=255.981 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=2000 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "med_dialog,subset=icliniq:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "med_dialog,subset=icliniq:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "med_dialog,subset=icliniq:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 261.095,
          "description": "min=239.772, mean=261.095, max=282.418, sum=522.19 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "med_dialog,subset=icliniq:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 51.19,
          "description": "min=50.736, mean=51.19, max=51.644, sum=102.38 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "med_dialog,subset=icliniq:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 2070.752,
          "description": "min=2070.752, mean=2070.752, max=2070.752, sum=2070.752 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 150.0,
          "description": "min=150, mean=150, max=150, sum=150 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 38.16,
          "description": "min=38.16, mean=38.16, max=38.16, sum=38.16 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 486.67333333333335,
          "description": "min=486.673, mean=486.673, max=486.673, sum=486.673 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 67.0,
          "description": "min=67, mean=67, max=67, sum=67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 685.4179104477612,
          "description": "min=685.418, mean=685.418, max=685.418, sum=685.418 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 100.04477611940298,
          "description": "min=100.045, mean=100.045, max=100.045, sum=100.045 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 183.76,
          "description": "min=183.76, mean=183.76, max=183.76, sum=183.76 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 257.2866666666667,
          "description": "min=257.287, mean=257.287, max=257.287, sum=257.287 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 376.774,
          "description": "min=376.774, mean=376.774, max=376.774, sum=376.774 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.102,
          "description": "min=1.102, mean=1.102, max=1.102, sum=1.102 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 962.504,
          "description": "min=962.504, mean=962.504, max=962.504, sum=962.504 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 61.127,
          "description": "min=61.127, mean=61.127, max=61.127, sum=61.127 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 220.0,
          "description": "min=220, mean=220, max=220, sum=220 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 2091.018181818182,
          "description": "min=2091.018, mean=2091.018, max=2091.018, sum=2091.018 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 167.0,
          "description": "min=167, mean=167, max=167, sum=167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 472.91616766467064,
          "description": "min=472.916, mean=472.916, max=472.916, sum=472.916 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 86.0,
          "description": "min=86, mean=86, max=86, sum=258 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=CREATININE,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=CREATININE,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=CREATININE,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 5588.84496124031,
          "description": "min=5538.512, mean=5588.845, max=5665.512, sum=16766.535 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=CREATININE,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=3 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=CREATININE,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 708.639,
          "description": "min=708.639, mean=708.639, max=708.639, sum=708.639 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 23367.678,
          "description": "min=23367.678, mean=23367.678, max=23367.678, sum=23367.678 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 327.0,
          "description": "min=327, mean=327, max=327, sum=327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 40350.51376146789,
          "description": "min=40350.514, mean=40350.514, max=40350.514, sum=40350.514 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 63948.647,
          "description": "min=63948.647, mean=63948.647, max=63948.647, sum=63948.647 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 15443.736,
          "description": "min=15443.736, mean=15443.736, max=15443.736, sum=15443.736 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        }
      ],
      [
        {
          "value": "o3-mini (2025-01-31)",
          "description": "",
          "markdown": false
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 585.063,
          "description": "min=585.063, mean=585.063, max=585.063, sum=585.063 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 2.471,
          "description": "min=2.471, mean=2.471, max=2.471, sum=2.471 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 78.61538461538461,
          "description": "min=28, mean=78.615, max=100, sum=1022 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 489.1569475450976,
          "description": "min=447.511, mean=489.157, max=526.164, sum=6359.04 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.9984615384615385,
          "description": "min=0.99, mean=0.998, max=1, sum=12.98 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 427.0,
          "description": "min=427, mean=427, max=427, sum=427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 789.3325526932084,
          "description": "min=789.333, mean=789.333, max=789.333, sum=789.333 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 631.599531615925,
          "description": "min=631.6, mean=631.6, max=631.6, sum=631.6 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 597.0,
          "description": "min=597, mean=597, max=597, sum=597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 311.64489112227807,
          "description": "min=311.645, mean=311.645, max=311.645, sum=311.645 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 22.768844221105528,
          "description": "min=22.769, mean=22.769, max=22.769, sum=22.769 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=3000 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 32868.44466666667,
          "description": "min=29647.986, mean=32868.445, max=35141.789, sum=98605.334 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.9993333333333333,
          "description": "min=0.997, mean=0.999, max=1.001, sum=2.998 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 150.624,
          "description": "min=150.624, mean=150.624, max=150.624, sum=150.624 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.988,
          "description": "min=0.988, mean=0.988, max=0.988, sum=0.988 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 308.0,
          "description": "min=308, mean=308, max=308, sum=308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 337.487012987013,
          "description": "min=337.487, mean=337.487, max=337.487, sum=337.487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.9902597402597403,
          "description": "min=0.99, mean=0.99, max=0.99, sum=0.99 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 149.0,
          "description": "min=149, mean=149, max=149, sum=149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 55993.476510067114,
          "description": "min=55993.477, mean=55993.477, max=55993.477, sum=55993.477 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 185.73825503355704,
          "description": "min=185.738, mean=185.738, max=185.738, sum=185.738 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 423.0,
          "description": "min=423, mean=423, max=423, sum=423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 284.16784869976357,
          "description": "min=284.168, mean=284.168, max=284.168, sum=284.168 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 915.0,
          "description": "min=915, mean=915, max=915, sum=915 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 754.4939890710383,
          "description": "min=754.494, mean=754.494, max=754.494, sum=754.494 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 3474.609,
          "description": "min=3474.609, mean=3474.609, max=3474.609, sum=3474.609 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 548.381,
          "description": "min=548.381, mean=548.381, max=548.381, sum=548.381 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 120.0,
          "description": "min=120, mean=120, max=120, sum=120 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1629.6166666666666,
          "description": "min=1629.617, mean=1629.617, max=1629.617, sum=1629.617 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 491.2583333333333,
          "description": "min=491.258, mean=491.258, max=491.258, sum=491.258 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 128.0,
          "description": "min=128, mean=128, max=128, sum=128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 304.84375,
          "description": "min=304.844, mean=304.844, max=304.844, sum=304.844 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 684.5703125,
          "description": "min=684.57, mean=684.57, max=684.57, sum=684.57 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 210.371,
          "description": "min=210.371, mean=210.371, max=210.371, sum=210.371 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 87.559,
          "description": "min=87.559, mean=87.559, max=87.559, sum=87.559 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 644.34,
          "description": "min=644.34, mean=644.34, max=644.34, sum=644.34 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 158.65,
          "description": "min=158.65, mean=158.65, max=158.65, sum=158.65 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 487.0,
          "description": "min=487, mean=487, max=487, sum=487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 723.8459958932239,
          "description": "min=723.846, mean=723.846, max=723.846, sum=723.846 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 145.43326488706364,
          "description": "min=145.433, mean=145.433, max=145.433, sum=145.433 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 689.0,
          "description": "min=689, mean=689, max=689, sum=689 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 22.712626995645863,
          "description": "min=22.713, mean=22.713, max=22.713, sum=22.713 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 259.14513788098697,
          "description": "min=259.145, mean=259.145, max=259.145, sum=259.145 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 361.0,
          "description": "min=361, mean=361, max=361, sum=361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 2622.0193905817173,
          "description": "min=2622.019, mean=2622.019, max=2622.019, sum=2622.019 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 737.5761772853185,
          "description": "min=737.576, mean=737.576, max=737.576, sum=737.576 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=2000 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 261.2355,
          "description": "min=239.917, mean=261.236, max=282.554, sum=522.471 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 54.4685,
          "description": "min=54.326, mean=54.468, max=54.611, sum=108.937 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 2075.158,
          "description": "min=2075.158, mean=2075.158, max=2075.158, sum=2075.158 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 150.0,
          "description": "min=150, mean=150, max=150, sum=150 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 38.16,
          "description": "min=38.16, mean=38.16, max=38.16, sum=38.16 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 403.76666666666665,
          "description": "min=403.767, mean=403.767, max=403.767, sum=403.767 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 67.0,
          "description": "min=67, mean=67, max=67, sum=67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 685.4179104477612,
          "description": "min=685.418, mean=685.418, max=685.418, sum=685.418 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 214.62686567164178,
          "description": "min=214.627, mean=214.627, max=214.627, sum=214.627 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 183.78666666666666,
          "description": "min=183.787, mean=183.787, max=183.787, sum=183.787 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 257.33,
          "description": "min=257.33, mean=257.33, max=257.33, sum=257.33 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 377.515,
          "description": "min=377.515, mean=377.515, max=377.515, sum=377.515 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.987,
          "description": "min=0.987, mean=0.987, max=0.987, sum=0.987 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 963.516,
          "description": "min=963.516, mean=963.516, max=963.516, sum=963.516 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 60.344,
          "description": "min=60.344, mean=60.344, max=60.344, sum=60.344 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 220.0,
          "description": "min=220, mean=220, max=220, sum=220 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 2095.2636363636366,
          "description": "min=2095.264, mean=2095.264, max=2095.264, sum=2095.264 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 167.0,
          "description": "min=167, mean=167, max=167, sum=167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 473.0239520958084,
          "description": "min=473.024, mean=473.024, max=473.024, sum=473.024 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 86.0,
          "description": "min=86, mean=86, max=86, sum=258 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 5594.158914728682,
          "description": "min=5543.826, mean=5594.159, max=5670.826, sum=16782.477 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=3 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 708.757,
          "description": "min=708.757, mean=708.757, max=708.757, sum=708.757 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.993,
          "description": "min=0.993, mean=0.993, max=0.993, sum=0.993 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.001,
          "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 23387.861,
          "description": "min=23387.861, mean=23387.861, max=23387.861, sum=23387.861 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.065,
          "description": "min=1.065, mean=1.065, max=1.065, sum=1.065 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 327.0,
          "description": "min=327, mean=327, max=327, sum=327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.009174311926605505,
          "description": "min=0.009, mean=0.009, max=0.009, sum=0.009 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 40377.91743119266,
          "description": "min=40377.917, mean=40377.917, max=40377.917, sum=40377.917 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.018,
          "description": "min=0.018, mean=0.018, max=0.018, sum=0.018 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 63970.858,
          "description": "min=63970.858, mean=63970.858, max=63970.858, sum=63970.858 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.736,
          "description": "min=1.736, mean=1.736, max=1.736, sum=1.736 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
      [
  {
    "title": "Accuracy",
    "header": [
      {
        "value": "Model",
        "markdown": false,
        "metadata": {}
      },
      {
        "value": "Mean win rate",
        "description": "How many models this model outperforms on average (over columns).",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {}
      },
      {
        "value": "MedCalc-Bench - MedCalc Accuracy",
        "description": "A dataset which consists of a patient note, a question requesting to compute a specific medical value, and a ground truth answer [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).\n\nMedCalc Accuracy: Comparison based on category. Exact match for categories risk, severity and diagnosis. Check if within range for the other categories.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "MedCalc Accuracy",
          "run_group": "MedCalc-Bench"
        }
      },
      {
        "value": "CLEAR - EM",
        "description": "A dataset for evaluating the presence of a specific medical condition from patient notes with yes/no/maybe classifications [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "CLEAR"
        }
      },
      {
        "value": "MTSamples - Jury Score",
        "description": "A dataset of clinical notes where the model is prompted to generate the appropriate treatment plan for this patient [(MTSamples, 2025)](https://mtsamples.com).\n\nMTSamples Replicate Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "MTSamples"
        }
      },
      {
        "value": "Medec - MedecFlagAcc",
        "description": "A dataset containing medical narratives with error detection and correction pairs [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).\n\nMedical Error Flag Accuracy: Measures how accurately the model identifies whether a clinical note contains an error (binary classification of correct/incorrect).",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "MedecFlagAcc",
          "run_group": "Medec"
        }
      },
      {
        "value": "EHRSHOT - EM",
        "description": "A dataset given a patient record of EHR codes, classifying if an event will occur at a future date or not [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "EHRSHOT"
        }
      },
      {
        "value": "HeadQA - EM",
        "description": "A collection of biomedical multiple-choice questions for testing medical knowledge [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "HeadQA"
        }
      },
      {
        "value": "Medbullets - EM",
        "description": "A USMLE-style medical question dataset with multiple-choice answers and explanations [(MedBullets, 2025)](https://step2.medbullets.com).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "Medbullets"
        }
      },
      {
        "value": "MedAlign - Jury Score",
        "description": "A dataset that asks models to answer questions/follow instructions over longitudinal EHR [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).\n\nMedalign Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "MedAlign"
        }
      },
      {
        "value": "ADHD-Behavior - EM",
        "description": "A dataset that classifies whether a clinical note contains a clinician recommendation for parent training in behavior management, which is the first-line evidence-based treatment for young children with ADHD [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "ADHD-Behavior"
        }
      },
      {
        "value": "ADHD-MedEffects - EM",
        "description": "A dataset that classifies whether a clinical note contains documentation of side effect monitoring (recording of absence or presence of medication side effects), as recommended in clinical practice guidelines [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "ADHD-MedEffects"
        }
      },
      {
        "value": "DischargeMe - Jury Score",
        "description": "DischargeMe is a discharge instruction generation dataset and brief hospital course generation dataset collected from MIMIC-IV data, considering only the discharge text as well as the radiology report text [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).\n\nDischargeMe Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "DischargeMe"
        }
      },
      {
        "value": "ACI-Bench - Jury Score",
        "description": "A dataset of patient-doctor conversations paired with structured clinical notes [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).\n\nACI-Bench Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "ACI-Bench"
        }
      },
      {
        "value": "MTSamples Procedures - Jury Score",
        "description": "A dataset that provides a patient note regarding an operation, with the objective to document the procedure.\n\nMTSamples Procedures Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "MTSamples Procedures"
        }
      },
      {
        "value": "MIMIC-RRS - Jury Score",
        "description": "A dataset containing radiology reports with findings sections from MIMIC-III paired with their corresponding impression sections, used for generating radiology report summaries [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).\n\nMIMIC-RRS Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "MIMIC-RRS"
        }
      },
      {
        "value": "MIMIC-BHC - Jury Score",
        "description": "A summarization task using a curated collection of preprocessed discharge notes paired with their corresponding brief hospital course (BHC) summaries [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).\n\nMIMIC-BHC Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "MIMIC-BHC"
        }
      },
      {
        "value": "NoteExtract - Jury Score",
        "description": "A dataset containing free form text of a clinical health worker care plan, with the associated goal being to restructure that text into a given format.\n\nNoteExtract Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "NoteExtract"
        }
      },
      {
        "value": "MedicationQA - Jury Score",
        "description": "Consumer medication questions with reference answers.\n\nMedicationQA Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "MedicationQA"
        }
      },
      {
        "value": "PatientInstruct - Jury Score",
        "description": "A dataset containing case details used to generate customized post-procedure patient instructions.\n\nPatientInstruct Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "PatientInstruct"
        }
      },
      {
        "value": "MedDialog - Jury Score",
        "description": "A collection of doctor-patient conversations with corresponding summaries.\n\nMedDialog Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "MedDialog"
        }
      },
      {
        "value": "MedConfInfo - EM",
        "description": "A dataset of clinical notes from adolescent patients used to identify sensitive protected health information that should be restricted from parental access [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "MedConfInfo"
        }
      },
      {
        "value": "MEDIQA - Jury Score",
        "description": "A dataset including a medical question, a set of candidate answers, relevance annotations for ranking, and additional context to evaluate understanding and retrieval capabilities in a healthcare setting.\n\nMediQA Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "MEDIQA"
        }
      },
      {
        "value": "MentalHealth - Jury Score",
        "description": "A dataset containing a counselor and mental health patient conversation, where the objective is to generate an empathetic counselor response.\n\nMentalHealth Jury Score: Measures the average score assigned by an LLM-based jury evaluating task performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Jury Score",
          "run_group": "MentalHealth"
        }
      },
      {
        "value": "ProxySender - EM",
        "description": "This dataset features messages sent by proxy users and non proxy users, for evaluation of LLM capabilities to determine the sender [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "ProxySender"
        }
      },
      {
        "value": "PrivacyDetection - EM",
        "description": "A dataset that determines if a message leaks any confidential information from the patient [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "PrivacyDetection"
        }
      },
      {
        "value": "PubMedQA - EM",
        "description": "A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "PubMedQA"
        }
      },
      {
        "value": "EHRSQL - EHRSQLExeAcc",
        "description": "Given a natural language instruction, generate an SQL query that would be used in clinical research.\n\nExecution accuracy for Generated Query: Measures the proportion of correctly predicted answerable questions among all questions predicted to be answerable.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EHRSQLExeAcc",
          "run_group": "EHRSQL"
        }
      },
      {
        "value": "BMT-Status - EM",
        "description": "A dataset containing patient notes with associated questions and answers related to bone marrow transplantation.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "BMT-Status"
        }
      },
      {
        "value": "RaceBias - EM",
        "description": "A collection of LLM outputs in response to medical questions with race-based biases, with the objective being to classify whether the output contains racially biased content.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "RaceBias"
        }
      },
      {
        "value": "N2C2-CT - EM",
        "description": "A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "N2C2-CT"
        }
      },
      {
        "value": "MedHallu - EM",
        "description": "A dataset of PubMed articles and associated questions, with the objective being to classify whether the answer is factual or hallucinated.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "MedHallu"
        }
      },
      {
        "value": "HospiceReferral - EM",
        "description": "A dataset evaluating performance in identifying appropriate patient referrals to hospice care.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "HospiceReferral"
        }
      },
      {
        "value": "MIMIC-IV Billing Code - MIMICBillingF1",
        "description": "A  dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes.\n\nF1 Score for MIMIC Billing Codes: Measures the harmonic mean of precision and recall for ICD codes, providing a balanced evaluation of the model's performance.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "MIMICBillingF1",
          "run_group": "MIMIC-IV Billing Code"
        }
      },
      {
        "value": "ClinicReferral - EM",
        "description": "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "ClinicReferral"
        }
      },
      {
        "value": "CDI-QA - EM",
        "description": "A dataset built from Clinical Document Integrity (CDI) notes, to assess the ability to answer verification questions from previous notes.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "CDI-QA"
        }
      },
      {
        "value": "ENT-Referral - EM",
        "description": "A dataset designed to evaluate performance in identifying appropriate patient referrals to Ear, Nose, and Throat specialists.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "ENT-Referral"
        }
      }
    ],
    "rows": [
      [
        {
          "value": "Claude 3.5 Sonnet (20241022)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6339285714285714,
          "markdown": false
        },
        {
          "value": 0.218,
          "description": "min=0.218, mean=0.218, max=0.218, sum=0.218 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.6633709088997862,
          "description": "min=0.333, mean=0.663, max=0.836, sum=8.624 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.382773874577155,
          "description": "min=4.383, mean=4.383, max=4.383, sum=4.383 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.5209380234505863,
          "description": "min=0.521, mean=0.521, max=0.521, sum=0.521 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.3583333333333334,
          "description": "min=0.136, mean=0.358, max=0.784, sum=1.075 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.906,
          "description": "min=0.906, mean=0.906, max=0.906, sum=0.906 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.6396103896103896,
          "description": "min=0.64, mean=0.64, max=0.64, sum=0.64 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.260999254287848,
          "description": "min=4.261, mean=4.261, max=4.261, sum=4.261 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.7683215130023641,
          "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.9420765027322404,
          "description": "min=0.942, mean=0.942, max=0.942, sum=0.942 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 3.829388888888886,
          "description": "min=3.829, mean=3.829, max=3.829, sum=3.829 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.47037037037037,
          "description": "min=4.47, mean=4.47, max=4.47, sum=4.47 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 3.9348958333333357,
          "description": "min=3.935, mean=3.935, max=3.935, sum=3.935 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.478166666666664,
          "description": "min=4.478, mean=4.478, max=4.478, sum=4.478 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.078888888888891,
          "description": "min=4.079, mean=4.079, max=4.079, sum=4.079 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.948437143509013,
          "description": "min=4.948, mean=4.948, max=4.948, sum=4.948 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.581680374133206,
          "description": "min=4.582, mean=4.582, max=4.582, sum=4.582 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 3.7611572791628247,
          "description": "min=3.761, mean=3.761, max=3.761, sum=3.761 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.109611111111109,
          "description": "min=4.065, mean=4.11, max=4.155, sum=8.219 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.77,
          "description": "min=0.77, mean=0.77, max=0.77, sum=0.77 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.659259259259261,
          "description": "min=4.659, mean=4.659, max=4.659, sum=4.659 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.963515754560531,
          "description": "min=4.964, mean=4.964, max=4.964, sum=4.964 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.7366666666666667,
          "description": "min=0.737, mean=0.737, max=0.737, sum=0.737 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.8266666666666667,
          "description": "min=0.827, mean=0.827, max=0.827, sum=0.827 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.744,
          "description": "min=0.744, mean=0.744, max=0.744, sum=0.744 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.14,
          "description": "min=0.14, mean=0.14, max=0.14, sum=0.14 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.8727272727272727,
          "description": "min=0.873, mean=0.873, max=0.873, sum=0.873 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.7724550898203593,
          "description": "min=0.772, mean=0.772, max=0.772, sum=0.772 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.8333333333333334,
          "description": "min=0.756, mean=0.833, max=0.93, sum=2.5 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.926,
          "description": "min=0.926, mean=0.926, max=0.926, sum=0.926 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.741,
          "description": "min=0.741, mean=0.741, max=0.741, sum=0.741 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.35292484847092087,
          "description": "min=0.353, mean=0.353, max=0.353, sum=0.353 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.908256880733945,
          "description": "min=0.908, mean=0.908, max=0.908, sum=0.908 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.602,
          "description": "min=0.602, mean=0.602, max=0.602, sum=0.602 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.532,
          "description": "min=0.532, mean=0.532, max=0.532, sum=0.532 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        }
      ],
      [
        {
          "value": "Claude 3.7 Sonnet (20250219)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6357142857142857,
          "markdown": false
        },
        {
          "value": 0.21,
          "description": "min=0.21, mean=0.21, max=0.21, sum=0.21 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.6469657842337921,
          "description": "min=0.333, mean=0.647, max=0.836, sum=8.411 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.425969294821761,
          "description": "min=4.426, mean=4.426, max=4.426, sum=4.426 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.628140703517588,
          "description": "min=0.628, mean=0.628, max=0.628, sum=0.628 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.4496666666666666,
          "description": "min=0.285, mean=0.45, max=0.715, sum=1.349 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.912,
          "description": "min=0.912, mean=0.912, max=0.912, sum=0.912 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.6493506493506493,
          "description": "min=0.649, mean=0.649, max=0.649, sum=0.649 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.187173750932139,
          "description": "min=4.187, mean=4.187, max=4.187, sum=4.187 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.8014184397163121,
          "description": "min=0.801, mean=0.801, max=0.801, sum=0.801 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.8163934426229508,
          "description": "min=0.816, mean=0.816, max=0.816, sum=0.816 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 3.803777777777774,
          "description": "min=3.804, mean=3.804, max=3.804, sum=3.804 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.617592592592595,
          "description": "min=4.618, mean=4.618, max=4.618, sum=4.618 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.0234375,
          "description": "min=4.023, mean=4.023, max=4.023, sum=4.023 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.5008888888888885,
          "description": "min=4.501, mean=4.501, max=4.501, sum=4.501 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 3.9911111111111097,
          "description": "min=3.991, mean=3.991, max=3.991, sum=3.991 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.949349760438056,
          "description": "min=4.949, mean=4.949, max=4.949, sum=4.949 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.628124496049018,
          "description": "min=4.628, mean=4.628, max=4.628, sum=4.628 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 3.6611265004616884,
          "description": "min=3.661, mean=3.661, max=3.661, sum=3.661 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.197555555555553,
          "description": "min=4.146, mean=4.198, max=4.249, sum=8.395 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.811,
          "description": "min=0.811, mean=0.811, max=0.811, sum=0.811 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.72814814814815,
          "description": "min=4.728, mean=4.728, max=4.728, sum=4.728 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.955223880597014,
          "description": "min=4.955, mean=4.955, max=4.955, sum=4.955 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.62,
          "description": "min=0.62, mean=0.62, max=0.62, sum=0.62 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.9966666666666667,
          "description": "min=0.997, mean=0.997, max=0.997, sum=0.997 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.625,
          "description": "min=0.625, mean=0.625, max=0.625, sum=0.625 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.081,
          "description": "min=0.081, mean=0.081, max=0.081, sum=0.081 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.9,
          "description": "min=0.9, mean=0.9, max=0.9, sum=0.9 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.6586826347305389,
          "description": "min=0.659, mean=0.659, max=0.659, sum=0.659 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.813953488372093,
          "description": "min=0.756, mean=0.814, max=0.895, sum=2.442 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.877,
          "description": "min=0.877, mean=0.877, max=0.877, sum=0.877 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.767,
          "description": "min=0.767, mean=0.767, max=0.767, sum=0.767 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.3554857455069554,
          "description": "min=0.355, mean=0.355, max=0.355, sum=0.355 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.8868501529051988,
          "description": "min=0.887, mean=0.887, max=0.887, sum=0.887 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.57,
          "description": "min=0.57, mean=0.57, max=0.57, sum=0.57 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.585,
          "description": "min=0.585, mean=0.585, max=0.585, sum=0.585 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        }
      ],
      [
        {
          "value": "DeepSeek R1",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6625,
          "style": {
            "font-weight": "bold"
          },
          "markdown": false
        },
        {
          "value": 0.348,
          "description": "min=0.348, mean=0.348, max=0.348, sum=0.348 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.8295041431536213,
          "description": "min=0.571, mean=0.83, max=0.985, sum=10.784 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.533177205308355,
          "description": "min=4.533, mean=4.533, max=4.533, sum=4.533 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.5912897822445561,
          "description": "min=0.591, mean=0.591, max=0.591, sum=0.591 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.22033333333333335,
          "description": "min=0.019, mean=0.22, max=0.507, sum=0.661 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.721,
          "description": "min=0.721, mean=0.721, max=0.721, sum=0.721 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.6558441558441559,
          "description": "min=0.656, mean=0.656, max=0.656, sum=0.656 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.182326621923938,
          "description": "min=4.182, mean=4.182, max=4.182, sum=4.182 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.8085106382978723,
          "description": "min=0.809, mean=0.809, max=0.809, sum=0.809 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.9377049180327869,
          "description": "min=0.938, mean=0.938, max=0.938, sum=0.938 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.297000000000006,
          "description": "min=4.297, mean=4.297, max=4.297, sum=4.297 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.673148148148149,
          "description": "min=4.673, mean=4.673, max=4.673, sum=4.673 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.230902777777779,
          "description": "min=4.231, mean=4.231, max=4.231, sum=4.231 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.610499999999993,
          "description": "min=4.61, mean=4.61, max=4.61, sum=4.61 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 3.9188888888888855,
          "description": "min=3.919, mean=3.919, max=3.919, sum=3.919 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.665297741273103,
          "description": "min=4.665, mean=4.665, max=4.665, sum=4.665 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.584744396065148,
          "description": "min=4.585, mean=4.585, max=4.585, sum=4.585 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.784856879039718,
          "description": "min=4.785, mean=4.785, max=4.785, sum=4.785 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.435555555555553,
          "description": "min=4.361, mean=4.436, max=4.51, sum=8.871 (2)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.739,
          "description": "min=0.739, mean=0.739, max=0.739, sum=0.739 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.757777777777781,
          "description": "min=4.758, mean=4.758, max=4.758, sum=4.758 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.950248756218905,
          "description": "min=4.95, mean=4.95, max=4.95, sum=4.95 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.7433333333333333,
          "description": "min=0.743, mean=0.743, max=0.743, sum=0.743 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.98,
          "description": "min=0.98, mean=0.98, max=0.98, sum=0.98 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.743,
          "description": "min=0.743, mean=0.743, max=0.743, sum=0.743 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.076,
          "description": "min=0.076, mean=0.076, max=0.076, sum=0.076 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.7772727272727272,
          "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.9161676646706587,
          "description": "min=0.916, mean=0.916, max=0.916, sum=0.916 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.8643410852713179,
          "description": "min=0.779, mean=0.864, max=0.93, sum=2.593 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.847,
          "description": "min=0.847, mean=0.847, max=0.847, sum=0.847 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.701,
          "description": "min=0.701, mean=0.701, max=0.701, sum=0.701 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.2989618858066198,
          "description": "min=0.299, mean=0.299, max=0.299, sum=0.299 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.8960244648318043,
          "description": "min=0.896, mean=0.896, max=0.896, sum=0.896 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.583,
          "description": "min=0.583, mean=0.583, max=0.583, sum=0.583 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.568,
          "description": "min=0.568, mean=0.568, max=0.568, sum=0.568 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        }
      ],
      [
        {
          "value": "Gemini 1.5 Pro (001)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.24107142857142858,
          "markdown": false
        },
        {
          "value": 0.12,
          "description": "min=0.12, mean=0.12, max=0.12, sum=0.12 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.6095005938613985,
          "description": "min=0.222, mean=0.61, max=0.859, sum=7.924 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 4.384595368201935,
          "description": "min=4.385, mean=4.385, max=4.385, sum=4.385 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.52428810720268,
          "description": "min=0.524, mean=0.524, max=0.524, sum=0.524 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.5,
          "description": "min=0.139, mean=0.5, max=0.864, sum=1.5 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.84,
          "description": "min=0.84, mean=0.84, max=0.84, sum=0.84 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.4772727272727273,
          "description": "min=0.477, mean=0.477, max=0.477, sum=0.477 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.884787472035795,
          "description": "min=3.885, mean=3.885, max=3.885, sum=3.885 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.7612293144208038,
          "description": "min=0.761, mean=0.761, max=0.761, sum=0.761 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.7311475409836066,
          "description": "min=0.731, mean=0.731, max=0.731, sum=0.731 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.619333333333337,
          "description": "min=3.619, mean=3.619, max=3.619, sum=3.619 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 4.493518518518516,
          "description": "min=4.494, mean=4.494, max=4.494, sum=4.494 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2.8090277777777786,
          "description": "min=2.809, mean=2.809, max=2.809, sum=2.809 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 4.209722222222227,
          "description": "min=4.21, mean=4.21, max=4.21, sum=4.21 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.6188888888888884,
          "description": "min=3.619, mean=3.619, max=3.619, sum=3.619 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 4.906913073237509,
          "description": "min=4.907, mean=4.907, max=4.907, sum=4.907 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.727624576681176,
          "description": "min=3.728, mean=3.728, max=3.728, sum=3.728 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.5575561711295847,
          "description": "min=3.558, mean=3.558, max=3.558, sum=3.558 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.723499999999989,
          "description": "min=3.633, mean=3.723, max=3.814, sum=7.447 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "med_dialog,subset=icliniq:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.74,
          "description": "min=0.74, mean=0.74, max=0.74, sum=0.74 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 4.225925925925926,
          "description": "min=4.226, mean=4.226, max=4.226, sum=4.226 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 4.378109452736319,
          "description": "min=4.378, mean=4.378, max=4.378, sum=4.378 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.7133333333333334,
          "description": "min=0.713, mean=0.713, max=0.713, sum=0.713 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.9533333333333334,
          "description": "min=0.953, mean=0.953, max=0.953, sum=0.953 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.625,
          "description": "min=0.625, mean=0.625, max=0.625, sum=0.625 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.152,
          "description": "min=0.152, mean=0.152, max=0.152, sum=0.152 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.7818181818181819,
          "description": "min=0.782, mean=0.782, max=0.782, sum=0.782 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.8203592814371258,
          "description": "min=0.82, mean=0.82, max=0.82, sum=0.82 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.7906976744186046,
          "description": "min=0.709, mean=0.791, max=0.86, sum=2.372 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.891,
          "description": "min=0.891, mean=0.891, max=0.891, sum=0.891 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.745,
          "description": "min=0.745, mean=0.745, max=0.745, sum=0.745 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.18605294710805736,
          "description": "min=0.186, mean=0.186, max=0.186, sum=0.186 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.8593272171253823,
          "description": "min=0.859, mean=0.859, max=0.859, sum=0.859 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.466,
          "description": "min=0.466, mean=0.466, max=0.466, sum=0.466 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.576,
          "description": "min=0.576, mean=0.576, max=0.576, sum=0.576 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        }
      ],
      [
        {
          "value": "Gemini 2.0 Flash",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.41964285714285715,
          "markdown": false
        },
        {
          "value": 0.158,
          "description": "min=0.158, mean=0.158, max=0.158, sum=0.158 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.6873491012925947,
          "description": "min=0.352, mean=0.687, max=0.87, sum=8.936 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.116835805360402,
          "description": "min=4.117, mean=4.117, max=4.117, sum=4.117 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.5963149078726968,
          "description": "min=0.596, mean=0.596, max=0.596, sum=0.596 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.5026666666666667,
          "description": "min=0.137, mean=0.503, max=0.816, sum=1.508 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.88,
          "description": "min=0.88, mean=0.88, max=0.88, sum=0.88 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.6298701298701299,
          "description": "min=0.63, mean=0.63, max=0.63, sum=0.63 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.715883668903806,
          "description": "min=3.716, mean=3.716, max=3.716, sum=3.716 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.7281323877068558,
          "description": "min=0.728, mean=0.728, max=0.728, sum=0.728 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.9355191256830601,
          "description": "min=0.936, mean=0.936, max=0.936, sum=0.936 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.4546666666666686,
          "description": "min=3.455, mean=3.455, max=3.455, sum=3.455 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.612037037037038,
          "description": "min=4.612, mean=4.612, max=4.612, sum=4.612 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.641493055555557,
          "description": "min=3.641, mean=3.641, max=3.641, sum=3.641 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.248500000000007,
          "description": "min=4.249, mean=4.249, max=4.249, sum=4.249 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.8299999999999987,
          "description": "min=3.83, mean=3.83, max=3.83, sum=3.83 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.927104722792611,
          "description": "min=4.927, mean=4.927, max=4.927, sum=4.927 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.428801806160294,
          "description": "min=4.429, mean=4.429, max=4.429, sum=4.429 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.4361341951369657,
          "description": "min=3.436, mean=3.436, max=3.436, sum=3.436 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.224527777777782,
          "description": "min=4.178, mean=4.225, max=4.271, sum=8.449 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "med_dialog,subset=icliniq:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.75,
          "description": "min=0.75, mean=0.75, max=0.75, sum=0.75 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.767407407407409,
          "description": "min=4.767, mean=4.767, max=4.767, sum=4.767 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.439469320066335,
          "description": "min=4.439, mean=4.439, max=4.439, sum=4.439 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.7466666666666667,
          "description": "min=0.747, mean=0.747, max=0.747, sum=0.747 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.9066666666666666,
          "description": "min=0.907, mean=0.907, max=0.907, sum=0.907 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.64,
          "description": "min=0.64, mean=0.64, max=0.64, sum=0.64 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.165,
          "description": "min=0.165, mean=0.165, max=0.165, sum=0.165 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.7954545454545454,
          "description": "min=0.795, mean=0.795, max=0.795, sum=0.795 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.8562874251497006,
          "description": "min=0.856, mean=0.856, max=0.856, sum=0.856 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.7170542635658914,
          "description": "min=0.523, mean=0.717, max=0.93, sum=2.151 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.908,
          "description": "min=0.908, mean=0.908, max=0.908, sum=0.908 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.597,
          "description": "min=0.597, mean=0.597, max=0.597, sum=0.597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.23228162127907237,
          "description": "min=0.232, mean=0.232, max=0.232, sum=0.232 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.8990825688073395,
          "description": "min=0.899, mean=0.899, max=0.899, sum=0.899 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.566,
          "description": "min=0.566, mean=0.566, max=0.566, sum=0.566 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.505,
          "description": "min=0.505, mean=0.505, max=0.505, sum=0.505 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        }
      ],
      [
        {
          "value": "GPT-4o (2024-05-13)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5696428571428571,
          "markdown": false
        },
        {
          "value": 0.188,
          "description": "min=0.188, mean=0.188, max=0.188, sum=0.188 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.6421195891654634,
          "description": "min=0.308, mean=0.642, max=0.84, sum=8.348 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.291438979963575,
          "description": "min=4.291, mean=4.291, max=4.291, sum=4.291 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.5795644891122278,
          "description": "min=0.58, mean=0.58, max=0.58, sum=0.58 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.5413333333333333,
          "description": "min=0.404, mean=0.541, max=0.738, sum=1.624 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.906,
          "description": "min=0.906, mean=0.906, max=0.906, sum=0.906 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.711038961038961,
          "description": "min=0.711, mean=0.711, max=0.711, sum=0.711 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 3.844146159582401,
          "description": "min=3.844, mean=3.844, max=3.844, sum=3.844 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.8368794326241135,
          "description": "min=0.837, mean=0.837, max=0.837, sum=0.837 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.9453551912568307,
          "description": "min=0.945, mean=0.945, max=0.945, sum=0.945 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 3.577999999999998,
          "description": "min=3.578, mean=3.578, max=3.578, sum=3.578 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.512037037037037,
          "description": "min=4.512, mean=4.512, max=4.512, sum=4.512 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 3.9088541666666687,
          "description": "min=3.909, mean=3.909, max=3.909, sum=3.909 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.191888888888877,
          "description": "min=4.192, mean=4.192, max=4.192, sum=4.192 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 3.9699999999999993,
          "description": "min=3.97, mean=3.97, max=3.97, sum=3.97 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.954825462012319,
          "description": "min=4.955, mean=4.955, max=4.955, sum=4.955 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.4162231898080915,
          "description": "min=4.416, mean=4.416, max=4.416, sum=4.416 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 3.620190827947067,
          "description": "min=3.62, mean=3.62, max=3.62, sum=3.62 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.211361111111113,
          "description": "min=4.146, mean=4.211, max=4.277, sum=8.423 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.772,
          "description": "min=0.772, mean=0.772, max=0.772, sum=0.772 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.608888888888893,
          "description": "min=4.609, mean=4.609, max=4.609, sum=4.609 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.497512437810945,
          "description": "min=4.498, mean=4.498, max=4.498, sum=4.498 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.72,
          "description": "min=0.72, mean=0.72, max=0.72, sum=0.72 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.9833333333333333,
          "description": "min=0.983, mean=0.983, max=0.983, sum=0.983 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.701,
          "description": "min=0.701, mean=0.701, max=0.701, sum=0.701 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.32,
          "description": "min=0.32, mean=0.32, max=0.32, sum=0.32 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.740909090909091,
          "description": "min=0.741, mean=0.741, max=0.741, sum=0.741 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.8982035928143712,
          "description": "min=0.898, mean=0.898, max=0.898, sum=0.898 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.8294573643410853,
          "description": "min=0.744, mean=0.829, max=0.942, sum=2.488 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.846,
          "description": "min=0.846, mean=0.846, max=0.846, sum=0.846 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.656,
          "description": "min=0.656, mean=0.656, max=0.656, sum=0.656 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.34586736846836813,
          "description": "min=0.346, mean=0.346, max=0.346, sum=0.346 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.8960244648318043,
          "description": "min=0.896, mean=0.896, max=0.896, sum=0.896 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.589,
          "description": "min=0.589, mean=0.589, max=0.589, sum=0.589 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.611,
          "description": "min=0.611, mean=0.611, max=0.611, sum=0.611 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        }
      ],
      [
        {
          "value": "GPT-4o mini (2024-07-18)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.39285714285714285,
          "markdown": false
        },
        {
          "value": 0.154,
          "description": "min=0.154, mean=0.154, max=0.154, sum=0.154 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.663374374081707,
          "description": "min=0.333, mean=0.663, max=0.88, sum=8.624 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.24668227946917,
          "description": "min=4.247, mean=4.247, max=4.247, sum=4.247 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.5326633165829145,
          "description": "min=0.533, mean=0.533, max=0.533, sum=0.533 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.9066666666666666,
          "description": "min=0.858, mean=0.907, max=0.993, sum=2.72 (3)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.832,
          "description": "min=0.832, mean=0.832, max=0.832, sum=0.832 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.564935064935065,
          "description": "min=0.565, mean=0.565, max=0.565, sum=0.565 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 3.6655480984340043,
          "description": "min=3.666, mean=3.666, max=3.666, sum=3.666 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.806146572104019,
          "description": "min=0.806, mean=0.806, max=0.806, sum=0.806 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.7562841530054645,
          "description": "min=0.756, mean=0.756, max=0.756, sum=0.756 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 3.691666666666669,
          "description": "min=3.692, mean=3.692, max=3.692, sum=3.692 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.369444444444443,
          "description": "min=4.369, mean=4.369, max=4.369, sum=4.369 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 3.9157986111111156,
          "description": "min=3.916, mean=3.916, max=3.916, sum=3.916 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.250444444444436,
          "description": "min=4.25, mean=4.25, max=4.25, sum=4.25 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 3.7944444444444447,
          "description": "min=3.794, mean=3.794, max=3.794, sum=3.794 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.942048825005705,
          "description": "min=4.942, mean=4.942, max=4.942, sum=4.942 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.329624254152552,
          "description": "min=4.33, mean=4.33, max=4.33, sum=4.33 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 3.6140350877193055,
          "description": "min=3.614, mean=3.614, max=3.614, sum=3.614 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.145166666666659,
          "description": "min=4.127, mean=4.145, max=4.163, sum=8.29 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.738,
          "description": "min=0.738, mean=0.738, max=0.738, sum=0.738 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.56814814814815,
          "description": "min=4.568, mean=4.568, max=4.568, sum=4.568 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.540630182421227,
          "description": "min=4.541, mean=4.541, max=4.541, sum=4.541 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.7333333333333333,
          "description": "min=0.733, mean=0.733, max=0.733, sum=0.733 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.96,
          "description": "min=0.96, mean=0.96, max=0.96, sum=0.96 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.672,
          "description": "min=0.672, mean=0.672, max=0.672, sum=0.672 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.112,
          "description": "min=0.112, mean=0.112, max=0.112, sum=0.112 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.8363636363636363,
          "description": "min=0.836, mean=0.836, max=0.836, sum=0.836 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.7664670658682635,
          "description": "min=0.766, mean=0.766, max=0.766, sum=0.766 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.7829457364341085,
          "description": "min=0.733, mean=0.783, max=0.872, sum=2.349 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.801,
          "description": "min=0.801, mean=0.801, max=0.801, sum=0.801 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.708,
          "description": "min=0.708, mean=0.708, max=0.708, sum=0.708 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.263243296579599,
          "description": "min=0.263, mean=0.263, max=0.263, sum=0.263 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.8990825688073395,
          "description": "min=0.899, mean=0.899, max=0.899, sum=0.899 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.579,
          "description": "min=0.579, mean=0.579, max=0.579, sum=0.579 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.594,
          "description": "min=0.594, mean=0.594, max=0.594, sum=0.594 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.3 Instruct (70B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.30357142857142855,
          "markdown": false
        },
        {
          "value": 0.113,
          "description": "min=0.113, mean=0.113, max=0.113, sum=0.113 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.8130430111845777,
          "description": "min=0.609, mean=0.813, max=0.926, sum=10.57 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=chronic_pain,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=homelessness,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=liver_disease,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=major_depression,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=personality_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=unemployment,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 4.144418423106954,
          "description": "min=4.144, mean=4.144, max=4.144, sum=4.144 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.5293132328308208,
          "description": "min=0.529, mean=0.529, max=0.529, sum=0.529 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.6443333333333333,
          "description": "min=0.194, mean=0.644, max=0.897, sum=1.933 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=lab_anemia,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=new_hypertension,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.854,
          "description": "min=0.854, mean=0.854, max=0.854, sum=0.854 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.6071428571428571,
          "description": "min=0.607, mean=0.607, max=0.607, sum=0.607 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 3.66666666666667,
          "description": "min=3.667, mean=3.667, max=3.667, sum=3.667 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.640661938534279,
          "description": "min=0.641, mean=0.641, max=0.641, sum=0.641 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.9245901639344263,
          "description": "min=0.925, mean=0.925, max=0.925, sum=0.925 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 3.567000000000002,
          "description": "min=3.567, mean=3.567, max=3.567, sum=3.567 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 4.124074074074071,
          "description": "min=4.124, mean=4.124, max=4.124, sum=4.124 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 3.8376736111111147,
          "description": "min=3.838, mean=3.838, max=3.838, sum=3.838 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 4.058111111111116,
          "description": "min=4.058, mean=4.058, max=4.058, sum=4.058 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 3.7677777777777766,
          "description": "min=3.768, mean=3.768, max=3.768, sum=3.768 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 4.923796486424825,
          "description": "min=4.924, mean=4.924, max=4.924, sum=4.924 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 4.490566037735847,
          "description": "min=4.491, mean=4.491, max=4.491, sum=4.491 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 3.5866420437057647,
          "description": "min=3.587, mean=3.587, max=3.587, sum=3.587 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 4.086388888888882,
          "description": "min=4.039, mean=4.086, max=4.133, sum=8.173 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "med_dialog,subset=icliniq:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.789,
          "description": "min=0.789, mean=0.789, max=0.789, sum=0.789 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 4.674074074074076,
          "description": "min=4.674, mean=4.674, max=4.674, sum=4.674 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 4.228855721393035,
          "description": "min=4.229, mean=4.229, max=4.229, sum=4.229 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.6833333333333333,
          "description": "min=0.683, mean=0.683, max=0.683, sum=0.683 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.9766666666666667,
          "description": "min=0.977, mean=0.977, max=0.977, sum=0.977 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.733,
          "description": "min=0.733, mean=0.733, max=0.733, sum=0.733 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.074,
          "description": "min=0.074, mean=0.074, max=0.074, sum=0.074 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.8363636363636363,
          "description": "min=0.836, mean=0.836, max=0.836, sum=0.836 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.5688622754491018,
          "description": "min=0.569, mean=0.569, max=0.569, sum=0.569 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.813953488372093,
          "description": "min=0.744, mean=0.814, max=0.884, sum=2.442 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=CREATININE,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.882,
          "description": "min=0.882, mean=0.882, max=0.882, sum=0.882 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.768,
          "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.19714220046154962,
          "description": "min=0.197, mean=0.197, max=0.197, sum=0.197 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.7828746177370031,
          "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.563,
          "description": "min=0.563, mean=0.563, max=0.563, sum=0.563 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.363,
          "description": "min=0.363, mean=0.363, max=0.363, sum=0.363 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        }
      ],
      [
        {
          "value": "o3-mini (2025-01-31)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6410714285714286,
          "markdown": false
        },
        {
          "value": 0.34,
          "description": "min=0.34, mean=0.34, max=0.34, sum=0.34 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.8305836653607831,
          "description": "min=0.707, mean=0.831, max=0.94, sum=10.798 (13)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.603695029924544,
          "description": "min=4.604, mean=4.604, max=4.604, sum=4.604 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.6867671691792295,
          "description": "min=0.687, mean=0.687, max=0.687, sum=0.687 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.642,
          "description": "min=0.342, mean=0.642, max=0.849, sum=1.926 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.893,
          "description": "min=0.893, mean=0.893, max=0.893, sum=0.893 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.8116883116883117,
          "description": "min=0.812, mean=0.812, max=0.812, sum=0.812 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.005219985085757,
          "description": "min=4.005, mean=4.005, max=4.005, sum=4.005 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.7990543735224587,
          "description": "min=0.799, mean=0.799, max=0.799, sum=0.799 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.9256830601092896,
          "description": "min=0.926, mean=0.926, max=0.926, sum=0.926 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.293000000000009,
          "description": "min=4.293, mean=4.293, max=4.293, sum=4.293 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.577777777777778,
          "description": "min=4.578, mean=4.578, max=4.578, sum=4.578 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.274305555555555,
          "description": "min=4.274, mean=4.274, max=4.274, sum=4.274 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.358888888888886,
          "description": "min=4.359, mean=4.359, max=4.359, sum=4.359 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 3.846666666666666,
          "description": "min=3.847, mean=3.847, max=3.847, sum=3.847 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.940908053844398,
          "description": "min=4.941, mean=4.941, max=4.941, sum=4.941 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.345266892436706,
          "description": "min=4.345, mean=4.345, max=4.345, sum=4.345 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.5807940904893885,
          "description": "min=4.581, mean=4.581, max=4.581, sum=4.581 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.227944444444448,
          "description": "min=4.194, mean=4.228, max=4.262, sum=8.456 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.713,
          "description": "min=0.713, mean=0.713, max=0.713, sum=0.713 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.56962962962963,
          "description": "min=4.57, mean=4.57, max=4.57, sum=4.57 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.933665008291875,
          "description": "min=4.934, mean=4.934, max=4.934, sum=4.934 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.6966666666666667,
          "description": "min=0.697, mean=0.697, max=0.697, sum=0.697 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.9166666666666666,
          "description": "min=0.917, mean=0.917, max=0.917, sum=0.917 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.743,
          "description": "min=0.743, mean=0.743, max=0.743, sum=0.743 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.272,
          "description": "min=0.272, mean=0.272, max=0.272, sum=0.272 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.8045454545454546,
          "description": "min=0.805, mean=0.805, max=0.805, sum=0.805 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.874251497005988,
          "description": "min=0.874, mean=0.874, max=0.874, sum=0.874 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.8953488372093023,
          "description": "min=0.849, mean=0.895, max=0.93, sum=2.686 (3)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.896,
          "description": "min=0.896, mean=0.896, max=0.896, sum=0.896 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.784,
          "description": "min=0.784, mean=0.784, max=0.784, sum=0.784 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.24289418050953807,
          "description": "min=0.243, mean=0.243, max=0.243, sum=0.243 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.8960244648318043,
          "description": "min=0.896, mean=0.896, max=0.896, sum=0.896 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.559,
          "description": "min=0.559, mean=0.559, max=0.559, sum=0.559 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.546,
          "description": "min=0.546, mean=0.546, max=0.546, sum=0.546 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        }
      ]
    ],
    "links": [
      {
        "text": "LaTeX",
        "href": "benchmark_output/releases/v2.0.0/groups/latex/medhelm_scenarios_accuracy.tex"
      },
      {
        "text": "JSON",
        "href": "benchmark_output/releases/v2.0.0/groups/json/medhelm_scenarios_accuracy.json"
      }
    ],
    "name": "accuracy"
  },
  {
    "title": "Efficiency",
    "header": [
      {
        "value": "Model",
        "markdown": false,
        "metadata": {}
      },
      {
        "value": "Mean win rate",
        "description": "How many models this model outperforms on average (over columns).",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {}
      },
      {
        "value": "MedCalc-Bench - Observed inference time (s)",
        "description": "A dataset which consists of a patient note, a question requesting to compute a specific medical value, and a ground truth answer [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MedCalc-Bench"
        }
      },
      {
        "value": "CLEAR - Observed inference time (s)",
        "description": "A dataset for evaluating the presence of a specific medical condition from patient notes with yes/no/maybe classifications [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "CLEAR"
        }
      },
      {
        "value": "MTSamples - Observed inference time (s)",
        "description": "A dataset of clinical notes where the model is prompted to generate the appropriate treatment plan for this patient [(MTSamples, 2025)](https://mtsamples.com).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MTSamples"
        }
      },
      {
        "value": "Medec - Observed inference time (s)",
        "description": "A dataset containing medical narratives with error detection and correction pairs [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "Medec"
        }
      },
      {
        "value": "EHRSHOT - Observed inference time (s)",
        "description": "A dataset given a patient record of EHR codes, classifying if an event will occur at a future date or not [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "EHRSHOT"
        }
      },
      {
        "value": "HeadQA - Observed inference time (s)",
        "description": "A collection of biomedical multiple-choice questions for testing medical knowledge [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "HeadQA"
        }
      },
      {
        "value": "Medbullets - Observed inference time (s)",
        "description": "A USMLE-style medical question dataset with multiple-choice answers and explanations [(MedBullets, 2025)](https://step2.medbullets.com).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "Medbullets"
        }
      },
      {
        "value": "MedAlign - Observed inference time (s)",
        "description": "A dataset that asks models to answer questions/follow instructions over longitudinal EHR [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MedAlign"
        }
      },
      {
        "value": "ADHD-Behavior - Observed inference time (s)",
        "description": "A dataset that classifies whether a clinical note contains a clinician recommendation for parent training in behavior management, which is the first-line evidence-based treatment for young children with ADHD [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "ADHD-Behavior"
        }
      },
      {
        "value": "ADHD-MedEffects - Observed inference time (s)",
        "description": "A dataset that classifies whether a clinical note contains documentation of side effect monitoring (recording of absence or presence of medication side effects), as recommended in clinical practice guidelines [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "ADHD-MedEffects"
        }
      },
      {
        "value": "DischargeMe - Observed inference time (s)",
        "description": "DischargeMe is a discharge instruction generation dataset and brief hospital course generation dataset collected from MIMIC-IV data, considering only the discharge text as well as the radiology report text [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "DischargeMe"
        }
      },
      {
        "value": "ACI-Bench - Observed inference time (s)",
        "description": "A dataset of patient-doctor conversations paired with structured clinical notes [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "ACI-Bench"
        }
      },
      {
        "value": "MTSamples Procedures - Observed inference time (s)",
        "description": "A dataset that provides a patient note regarding an operation, with the objective to document the procedure.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MTSamples Procedures"
        }
      },
      {
        "value": "MIMIC-RRS - Observed inference time (s)",
        "description": "A dataset containing radiology reports with findings sections from MIMIC-III paired with their corresponding impression sections, used for generating radiology report summaries [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MIMIC-RRS"
        }
      },
      {
        "value": "MIMIC-BHC - Observed inference time (s)",
        "description": "A summarization task using a curated collection of preprocessed discharge notes paired with their corresponding brief hospital course (BHC) summaries [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MIMIC-BHC"
        }
      },
      {
        "value": "NoteExtract - Observed inference time (s)",
        "description": "A dataset containing free form text of a clinical health worker care plan, with the associated goal being to restructure that text into a given format.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "NoteExtract"
        }
      },
      {
        "value": "MedicationQA - Observed inference time (s)",
        "description": "Consumer medication questions with reference answers.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MedicationQA"
        }
      },
      {
        "value": "PatientInstruct - Observed inference time (s)",
        "description": "A dataset containing case details used to generate customized post-procedure patient instructions.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "PatientInstruct"
        }
      },
      {
        "value": "MedDialog - Observed inference time (s)",
        "description": "A collection of doctor-patient conversations with corresponding summaries.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MedDialog"
        }
      },
      {
        "value": "MedConfInfo - Observed inference time (s)",
        "description": "A dataset of clinical notes from adolescent patients used to identify sensitive protected health information that should be restricted from parental access [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MedConfInfo"
        }
      },
      {
        "value": "MEDIQA - Observed inference time (s)",
        "description": "A dataset including a medical question, a set of candidate answers, relevance annotations for ranking, and additional context to evaluate understanding and retrieval capabilities in a healthcare setting.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MEDIQA"
        }
      },
      {
        "value": "MentalHealth - Observed inference time (s)",
        "description": "A dataset containing a counselor and mental health patient conversation, where the objective is to generate an empathetic counselor response.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MentalHealth"
        }
      },
      {
        "value": "ProxySender - Observed inference time (s)",
        "description": "This dataset features messages sent by proxy users and non proxy users, for evaluation of LLM capabilities to determine the sender [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "ProxySender"
        }
      },
      {
        "value": "PrivacyDetection - Observed inference time (s)",
        "description": "A dataset that determines if a message leaks any confidential information from the patient [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "PrivacyDetection"
        }
      },
      {
        "value": "PubMedQA - Observed inference time (s)",
        "description": "A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "PubMedQA"
        }
      },
      {
        "value": "EHRSQL - Observed inference time (s)",
        "description": "Given a natural language instruction, generate an SQL query that would be used in clinical research.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "EHRSQL"
        }
      },
      {
        "value": "BMT-Status - Observed inference time (s)",
        "description": "A dataset containing patient notes with associated questions and answers related to bone marrow transplantation.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "BMT-Status"
        }
      },
      {
        "value": "RaceBias - Observed inference time (s)",
        "description": "A collection of LLM outputs in response to medical questions with race-based biases, with the objective being to classify whether the output contains racially biased content.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "RaceBias"
        }
      },
      {
        "value": "N2C2-CT - Observed inference time (s)",
        "description": "A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "N2C2-CT"
        }
      },
      {
        "value": "MedHallu - Observed inference time (s)",
        "description": "A dataset of PubMed articles and associated questions, with the objective being to classify whether the answer is factual or hallucinated.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MedHallu"
        }
      },
      {
        "value": "HospiceReferral - Observed inference time (s)",
        "description": "A dataset evaluating performance in identifying appropriate patient referrals to hospice care.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "HospiceReferral"
        }
      },
      {
        "value": "MIMIC-IV Billing Code - Observed inference time (s)",
        "description": "A  dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MIMIC-IV Billing Code"
        }
      },
      {
        "value": "ClinicReferral - Observed inference time (s)",
        "description": "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "ClinicReferral"
        }
      },
      {
        "value": "CDI-QA - Observed inference time (s)",
        "description": "A dataset built from Clinical Document Integrity (CDI) notes, to assess the ability to answer verification questions from previous notes.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "CDI-QA"
        }
      },
      {
        "value": "ENT-Referral - Observed inference time (s)",
        "description": "A dataset designed to evaluate performance in identifying appropriate patient referrals to Ear, Nose, and Throat specialists.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "ENT-Referral"
        }
      }
    ],
    "rows": [
      [
        {
          "value": "Claude 3.5 Sonnet (20241022)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.43014705882352944,
          "markdown": false
        },
        {
          "value": 1.3506605696678162,
          "description": "min=1.351, mean=1.351, max=1.351, sum=1.351 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.707505668594481,
          "description": "min=1.672, mean=1.708, max=1.764, sum=22.198 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 10.976841152133092,
          "description": "min=10.977, mean=10.977, max=10.977, sum=10.977 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 3.558966633462826,
          "description": "min=3.559, mean=3.559, max=3.559, sum=3.559 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 7.37325446120898,
          "description": "min=6.852, mean=7.373, max=7.856, sum=22.12 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.3471950261592864,
          "description": "min=1.347, mean=1.347, max=1.347, sum=1.347 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.3105680098781338,
          "description": "min=1.311, mean=1.311, max=1.311, sum=1.311 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 15.022760760864156,
          "description": "min=15.023, mean=15.023, max=15.023, sum=15.023 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.6249394027899342,
          "description": "min=1.625, mean=1.625, max=1.625, sum=1.625 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.794606918585105,
          "description": "min=1.795, mean=1.795, max=1.795, sum=1.795 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 9.33179973578453,
          "description": "min=9.332, mean=9.332, max=9.332, sum=9.332 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 11.376140429576237,
          "description": "min=11.376, mean=11.376, max=11.376, sum=11.376 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 10.682079760357738,
          "description": "min=10.682, mean=10.682, max=10.682, sum=10.682 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 3.1525755443573,
          "description": "min=3.153, mean=3.153, max=3.153, sum=3.153 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 5.536085119247437,
          "description": "min=5.536, mean=5.536, max=5.536, sum=5.536 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 8.081205949646247,
          "description": "min=8.081, mean=8.081, max=8.081, sum=8.081 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 6.5651008826727795,
          "description": "min=6.565, mean=6.565, max=6.565, sum=6.565 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 7.503762823723029,
          "description": "min=7.504, mean=7.504, max=7.504, sum=7.504 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 2.7432067596912386,
          "description": "min=2.721, mean=2.743, max=2.766, sum=5.486 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 2.208379147768021,
          "description": "min=2.208, mean=2.208, max=2.208, sum=2.208 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 9.511107142766317,
          "description": "min=9.511, mean=9.511, max=9.511, sum=9.511 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 6.4161380796290155,
          "description": "min=6.416, mean=6.416, max=6.416, sum=6.416 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.543018540541331,
          "description": "min=1.543, mean=1.543, max=1.543, sum=1.543 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.6228671113650004,
          "description": "min=1.623, mean=1.623, max=1.623, sum=1.623 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.489382393360138,
          "description": "min=1.489, mean=1.489, max=1.489, sum=1.489 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.226262514407818,
          "description": "min=4.226, mean=4.226, max=4.226, sum=4.226 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 2.24667038267309,
          "description": "min=2.247, mean=2.247, max=2.247, sum=2.247 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.2453928564836878,
          "description": "min=1.245, mean=1.245, max=1.245, sum=1.245 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 3.34105330197386,
          "description": "min=3.265, mean=3.341, max=3.381, sum=10.023 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 2.388364407300949,
          "description": "min=2.388, mean=2.388, max=2.388, sum=2.388 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 5.993295238733292,
          "description": "min=5.993, mean=5.993, max=5.993, sum=5.993 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 9.196969989607458,
          "description": "min=9.197, mean=9.197, max=9.197, sum=9.197 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 13.668995621919631,
          "description": "min=13.669, mean=13.669, max=13.669, sum=13.669 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 4.761423567295075,
          "description": "min=4.761, mean=4.761, max=4.761, sum=4.761 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        }
      ],
      [
        {
          "value": "Claude 3.7 Sonnet (20250219)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.34558823529411764,
          "markdown": false
        },
        {
          "value": 3.863195901632309,
          "description": "min=3.863, mean=3.863, max=3.863, sum=3.863 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.5894952763282189,
          "description": "min=1.541, mean=1.589, max=1.627, sum=20.663 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 13.848624661599723,
          "description": "min=13.849, mean=13.849, max=13.849, sum=13.849 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.4292833493582566,
          "description": "min=1.429, mean=1.429, max=1.429, sum=1.429 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 7.860006173849105,
          "description": "min=7.014, mean=7.86, max=8.915, sum=23.58 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.3447579393386841,
          "description": "min=1.345, mean=1.345, max=1.345, sum=1.345 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.252411273392764,
          "description": "min=1.252, mean=1.252, max=1.252, sum=1.252 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 18.35834754873442,
          "description": "min=18.358, mean=18.358, max=18.358, sum=18.358 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.6512611312505483,
          "description": "min=1.651, mean=1.651, max=1.651, sum=1.651 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.7997335014447489,
          "description": "min=1.8, mean=1.8, max=1.8, sum=1.8 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 9.756311689853668,
          "description": "min=9.756, mean=9.756, max=9.756, sum=9.756 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 13.21376874645551,
          "description": "min=13.214, mean=13.214, max=13.214, sum=13.214 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 12.927648959681392,
          "description": "min=12.928, mean=12.928, max=12.928, sum=12.928 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 3.2667968525886537,
          "description": "min=3.267, mean=3.267, max=3.267, sum=3.267 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 6.3847999048233035,
          "description": "min=6.385, mean=6.385, max=6.385, sum=6.385 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 5.383256822640891,
          "description": "min=5.383, mean=5.383, max=5.383, sum=5.383 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 8.33733369269461,
          "description": "min=8.337, mean=8.337, max=8.337, sum=8.337 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 8.463370934087484,
          "description": "min=8.463, mean=8.463, max=8.463, sum=8.463 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 3.1681859172582625,
          "description": "min=3.022, mean=3.168, max=3.314, sum=6.336 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 2.0782204871177674,
          "description": "min=2.078, mean=2.078, max=2.078, sum=2.078 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 11.665022039413453,
          "description": "min=11.665, mean=11.665, max=11.665, sum=11.665 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 7.703489143456986,
          "description": "min=7.703, mean=7.703, max=7.703, sum=7.703 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.765497265656789,
          "description": "min=1.765, mean=1.765, max=1.765, sum=1.765 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 2.0104193170865376,
          "description": "min=2.01, mean=2.01, max=2.01, sum=2.01 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 2.4872676334381105,
          "description": "min=2.487, mean=2.487, max=2.487, sum=2.487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 8.11839797091484,
          "description": "min=8.118, mean=8.118, max=8.118, sum=8.118 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 2.2075942852280357,
          "description": "min=2.208, mean=2.208, max=2.208, sum=2.208 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 2.5685647547601937,
          "description": "min=2.569, mean=2.569, max=2.569, sum=2.569 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 3.0745366978090867,
          "description": "min=3.06, mean=3.075, max=3.085, sum=9.224 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.534155822515488,
          "description": "min=4.534, mean=4.534, max=4.534, sum=4.534 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 6.536557952165603,
          "description": "min=6.537, mean=6.537, max=6.537, sum=6.537 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 10.019044827248344,
          "description": "min=10.019, mean=10.019, max=10.019, sum=10.019 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 16.081016157627104,
          "description": "min=16.081, mean=16.081, max=16.081, sum=16.081 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 4.905725754976273,
          "description": "min=4.906, mean=4.906, max=4.906, sum=4.906 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        }
      ],
      [
        {
          "value": "DeepSeek R1",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.01838235294117647,
          "markdown": false
        },
        {
          "value": 43.75286227345467,
          "description": "min=43.753, mean=43.753, max=43.753, sum=43.753 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 5.571368995212822,
          "description": "min=3.961, mean=5.571, max=7.31, sum=72.428 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 29.774344934014714,
          "description": "min=29.774, mean=29.774, max=29.774, sum=29.774 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 41.87717197728117,
          "description": "min=41.877, mean=41.877, max=41.877, sum=41.877 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 32.238988775283346,
          "description": "min=29.453, mean=32.239, max=34.805, sum=96.717 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 20.78036990451813,
          "description": "min=20.78, mean=20.78, max=20.78, sum=20.78 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 34.088611539308125,
          "description": "min=34.089, mean=34.089, max=34.089, sum=34.089 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 55.19278714320804,
          "description": "min=55.193, mean=55.193, max=55.193, sum=55.193 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 3.7312907789334058,
          "description": "min=3.731, mean=3.731, max=3.731, sum=3.731 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.9966402445110765,
          "description": "min=4.997, mean=4.997, max=4.997, sum=4.997 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 14.265671773433684,
          "description": "min=14.266, mean=14.266, max=14.266, sum=14.266 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 15.778188250462215,
          "description": "min=15.778, mean=15.778, max=15.778, sum=15.778 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 21.84207220003009,
          "description": "min=21.842, mean=21.842, max=21.842, sum=21.842 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 12.422235173121615,
          "description": "min=12.422, mean=12.422, max=12.422, sum=12.422 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 8.098028691128047,
          "description": "min=8.098, mean=8.098, max=8.098, sum=8.098 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 29.56650953713873,
          "description": "min=29.567, mean=29.567, max=29.567, sum=29.567 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 21.51520863188521,
          "description": "min=21.515, mean=21.515, max=21.515, sum=21.515 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 21.370546659274115,
          "description": "min=21.371, mean=21.371, max=21.371, sum=21.371 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 8.633845594762542,
          "description": "min=7.447, mean=8.634, max=9.82, sum=17.268 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.726435486793518,
          "description": "min=4.726, mean=4.726, max=4.726, sum=4.726 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 21.951781096458436,
          "description": "min=21.952, mean=21.952, max=21.952, sum=21.952 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 16.309963222759873,
          "description": "min=16.31, mean=16.31, max=16.31, sum=16.31 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 5.393800741036733,
          "description": "min=5.394, mean=5.394, max=5.394, sum=5.394 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 3.011624131202698,
          "description": "min=3.012, mean=3.012, max=3.012, sum=3.012 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 8.807464688062668,
          "description": "min=8.807, mean=8.807, max=8.807, sum=8.807 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 19.193029651880263,
          "description": "min=19.193, mean=19.193, max=19.193, sum=19.193 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 4.999103357575157,
          "description": "min=4.999, mean=4.999, max=4.999, sum=4.999 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 7.5093938205056565,
          "description": "min=7.509, mean=7.509, max=7.509, sum=7.509 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 15.904167234127527,
          "description": "min=11.84, mean=15.904, max=23.456, sum=47.713 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 6.629145324707031,
          "description": "min=6.629, mean=6.629, max=6.629, sum=6.629 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 14.038371870109627,
          "description": "min=14.038, mean=14.038, max=14.038, sum=14.038 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 18.646856945224492,
          "description": "min=18.647, mean=18.647, max=18.647, sum=18.647 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 31.498115318775177,
          "description": "min=31.498, mean=31.498, max=31.498, sum=31.498 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 8.433589814662934,
          "description": "min=8.434, mean=8.434, max=8.434, sum=8.434 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        }
      ],
      [
        {
          "value": "Gemini 1.5 Pro (001)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5073529411764706,
          "markdown": false
        },
        {
          "value": 0.5846519210338592,
          "description": "min=0.585, mean=0.585, max=0.585, sum=0.585 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2.112702348308752,
          "description": "min=2.039, mean=2.113, max=2.412, sum=27.465 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 13.974278546049668,
          "description": "min=13.974, mean=13.974, max=13.974, sum=13.974 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.9463922046936137,
          "description": "min=0.946, mean=0.946, max=0.946, sum=0.946 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 4.764238543669383,
          "description": "min=4.487, mean=4.764, max=5.091, sum=14.293 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.7839107191562653,
          "description": "min=0.784, mean=0.784, max=0.784, sum=0.784 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.5115113142248872,
          "description": "min=0.512, mean=0.512, max=0.512, sum=0.512 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 10.494709195706669,
          "description": "min=10.495, mean=10.495, max=10.495, sum=10.495 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1.9810318197200767,
          "description": "min=1.981, mean=1.981, max=1.981, sum=1.981 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1.9988630597057238,
          "description": "min=1.999, mean=1.999, max=1.999, sum=1.999 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 7.394040625572204,
          "description": "min=7.394, mean=7.394, max=7.394, sum=7.394 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 11.991099919875463,
          "description": "min=11.991, mean=11.991, max=11.991, sum=11.991 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 7.07927854731679,
          "description": "min=7.079, mean=7.079, max=7.079, sum=7.079 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.4135646600723266,
          "description": "min=3.414, mean=3.414, max=3.414, sum=3.414 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.7724135589599608,
          "description": "min=3.772, mean=3.772, max=3.772, sum=3.772 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 4.6515466530465,
          "description": "min=4.652, mean=4.652, max=4.652, sum=4.652 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 6.1594320007955385,
          "description": "min=6.159, mean=6.159, max=6.159, sum=6.159 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 7.228144612668954,
          "description": "min=7.228, mean=7.228, max=7.228, sum=7.228 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2.9894730966091156,
          "description": "min=2.966, mean=2.989, max=3.013, sum=5.979 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "med_dialog,subset=icliniq:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2.17378666639328,
          "description": "min=2.174, mean=2.174, max=2.174, sum=2.174 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 8.410743509928386,
          "description": "min=8.411, mean=8.411, max=8.411, sum=8.411 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.906367454955827,
          "description": "min=3.906, mean=3.906, max=3.906, sum=3.906 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2.3673279166221617,
          "description": "min=2.367, mean=2.367, max=2.367, sum=2.367 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2.226118216514587,
          "description": "min=2.226, mean=2.226, max=2.226, sum=2.226 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.5072131760120392,
          "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1.806319487810135,
          "description": "min=1.806, mean=1.806, max=1.806, sum=1.806 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2.147076643596996,
          "description": "min=2.147, mean=2.147, max=2.147, sum=2.147 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.5023995673585081,
          "description": "min=0.502, mean=0.502, max=0.502, sum=0.502 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2.7554199520007585,
          "description": "min=2.722, mean=2.755, max=2.792, sum=8.266 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.5319631929397584,
          "description": "min=0.532, mean=0.532, max=0.532, sum=0.532 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 6.904875324964523,
          "description": "min=6.905, mean=6.905, max=6.905, sum=6.905 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 5.151954744204834,
          "description": "min=5.152, mean=5.152, max=5.152, sum=5.152 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 14.265551701307297,
          "description": "min=14.266, mean=14.266, max=14.266, sum=14.266 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.5295421197414396,
          "description": "min=3.53, mean=3.53, max=3.53, sum=3.53 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        }
      ],
      [
        {
          "value": "Gemini 2.0 Flash",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7647058823529411,
          "markdown": false
        },
        {
          "value": 0.38494687390327453,
          "description": "min=0.385, mean=0.385, max=0.385, sum=0.385 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1.916397778879627,
          "description": "min=1.878, mean=1.916, max=2.008, sum=24.913 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 5.227254615176198,
          "description": "min=5.227, mean=5.227, max=5.227, sum=5.227 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.4693439094664863,
          "description": "min=0.469, mean=0.469, max=0.469, sum=0.469 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.30513532336553,
          "description": "min=3.199, mean=3.305, max=3.479, sum=9.915 (3)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.33297129392623903,
          "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.33868198038695696,
          "description": "min=0.339, mean=0.339, max=0.339, sum=0.339 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 5.251175875631755,
          "description": "min=5.251, mean=5.251, max=5.251, sum=5.251 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1.8757229600955971,
          "description": "min=1.876, mean=1.876, max=1.876, sum=1.876 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1.877303554190964,
          "description": "min=1.877, mean=1.877, max=1.877, sum=1.877 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.683770047426224,
          "description": "min=3.684, mean=3.684, max=3.684, sum=3.684 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.9915327548980715,
          "description": "min=4.992, mean=4.992, max=4.992, sum=4.992 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 5.109029924497008,
          "description": "min=5.109, mean=5.109, max=5.109, sum=5.109 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2.4756844749450684,
          "description": "min=2.476, mean=2.476, max=2.476, sum=2.476 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2.571005198955536,
          "description": "min=2.571, mean=2.571, max=2.571, sum=2.571 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.346397649580938,
          "description": "min=3.346, mean=3.346, max=3.346, sum=3.346 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.843229794190826,
          "description": "min=3.843, mean=3.843, max=3.843, sum=3.843 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.8159013593626154,
          "description": "min=3.816, mean=3.816, max=3.816, sum=3.816 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2.359554039001465,
          "description": "min=2.326, mean=2.36, max=2.393, sum=4.719 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "med_dialog,subset=icliniq:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2.023589251279831,
          "description": "min=2.024, mean=2.024, max=2.024, sum=2.024 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 5.9853372367223105,
          "description": "min=5.985, mean=5.985, max=5.985, sum=5.985 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2.684903842299732,
          "description": "min=2.685, mean=2.685, max=2.685, sum=2.685 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2.35461799065272,
          "description": "min=2.355, mean=2.355, max=2.355, sum=2.355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2.3275622653961183,
          "description": "min=2.328, mean=2.328, max=2.328, sum=2.328 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.3441569275856018,
          "description": "min=0.344, mean=0.344, max=0.344, sum=0.344 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.7652456421852112,
          "description": "min=0.765, mean=0.765, max=0.765, sum=0.765 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1.9761667815121737,
          "description": "min=1.976, mean=1.976, max=1.976, sum=1.976 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.40159591086610347,
          "description": "min=0.402, mean=0.402, max=0.402, sum=0.402 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2.5136772468108544,
          "description": "min=2.481, mean=2.514, max=2.551, sum=7.541 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.33655450963974,
          "description": "min=0.337, mean=0.337, max=0.337, sum=0.337 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3.136566119670868,
          "description": "min=3.137, mean=3.137, max=3.137, sum=3.137 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 3.85423586055044,
          "description": "min=3.854, mean=3.854, max=3.854, sum=3.854 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 4.905799955368042,
          "description": "min=4.906, mean=4.906, max=4.906, sum=4.906 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2.7608540432453155,
          "description": "min=2.761, mean=2.761, max=2.761, sum=2.761 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        }
      ],
      [
        {
          "value": "GPT-4o (2024-05-13)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.75,
          "markdown": false
        },
        {
          "value": 1.2598307481160513,
          "description": "min=1.26, mean=1.26, max=1.26, sum=1.26 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.4909083424839861,
          "description": "min=0.334, mean=0.491, max=1.079, sum=6.382 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.362946756587309,
          "description": "min=4.363, mean=4.363, max=4.363, sum=4.363 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.2239204533840886,
          "description": "min=1.224, mean=1.224, max=1.224, sum=1.224 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 29.547294229984285,
          "description": "min=27.229, mean=29.547, max=31.257, sum=88.642 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.36129777812957764,
          "description": "min=0.361, mean=0.361, max=0.361, sum=0.361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.3442080895622055,
          "description": "min=0.344, mean=0.344, max=0.344, sum=0.344 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 45.68971297084885,
          "description": "min=45.69, mean=45.69, max=45.69, sum=45.69 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.42209514151228233,
          "description": "min=0.422, mean=0.422, max=0.422, sum=0.422 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.4368795751873913,
          "description": "min=0.437, mean=0.437, max=0.437, sum=0.437 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 5.880889887332916,
          "description": "min=5.881, mean=5.881, max=5.881, sum=5.881 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 5.898902521530787,
          "description": "min=5.899, mean=5.899, max=5.899, sum=5.899 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 9.548104664310813,
          "description": "min=9.548, mean=9.548, max=9.548, sum=9.548 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.6024561272612934,
          "description": "min=1.602, mean=1.602, max=1.602, sum=1.602 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 2.68542049407959,
          "description": "min=2.685, mean=2.685, max=2.685, sum=2.685 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 2.600984902352523,
          "description": "min=2.601, mean=2.601, max=2.601, sum=2.601 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 2.896803928217452,
          "description": "min=2.897, mean=2.897, max=2.897, sum=2.897 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.660049803039043,
          "description": "min=4.66, mean=4.66, max=4.66, sum=4.66 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.8463265186207465,
          "description": "min=0.823, mean=0.846, max=0.87, sum=1.693 (2)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.5284792273044586,
          "description": "min=0.528, mean=0.528, max=0.528, sum=0.528 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.640198082923889,
          "description": "min=4.64, mean=4.64, max=4.64, sum=4.64 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.2639549596985775,
          "description": "min=1.264, mean=1.264, max=1.264, sum=1.264 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.6905542230606079,
          "description": "min=0.691, mean=0.691, max=0.691, sum=0.691 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.6653454645474752,
          "description": "min=0.665, mean=0.665, max=0.665, sum=0.665 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 8.718091148138047,
          "description": "min=8.718, mean=8.718, max=8.718, sum=8.718 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 3.8335974130630492,
          "description": "min=3.834, mean=3.834, max=3.834, sum=3.834 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.4940158518877896,
          "description": "min=0.494, mean=0.494, max=0.494, sum=0.494 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.32343775355173443,
          "description": "min=0.323, mean=0.323, max=0.323, sum=0.323 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 5.889826661856599,
          "description": "min=5.881, mean=5.89, max=5.903, sum=17.669 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.6999403750896454,
          "description": "min=1.7, mean=1.7, max=1.7, sum=1.7 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 2.2031820595264433,
          "description": "min=2.203, mean=2.203, max=2.203, sum=2.203 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 3.7981141481195384,
          "description": "min=3.798, mean=3.798, max=3.798, sum=3.798 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 5.85838643527031,
          "description": "min=5.858, mean=5.858, max=5.858, sum=5.858 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.5592270965576172,
          "description": "min=1.559, mean=1.559, max=1.559, sum=1.559 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        }
      ],
      [
        {
          "value": "GPT-4o mini (2024-07-18)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7830882352941176,
          "style": {
            "font-weight": "bold"
          },
          "markdown": false
        },
        {
          "value": 0.5731263158318994,
          "description": "min=0.573, mean=0.573, max=0.573, sum=0.573 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.8804333776680175,
          "description": "min=0.585, mean=0.88, max=1.126, sum=11.446 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.75874111960916,
          "description": "min=4.759, mean=4.759, max=4.759, sum=4.759 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.8237213220726065,
          "description": "min=0.824, mean=0.824, max=0.824, sum=0.824 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 29.851671765327453,
          "description": "min=27.479, mean=29.852, max=31.687, sum=89.555 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.5146426157951355,
          "description": "min=0.515, mean=0.515, max=0.515, sum=0.515 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.5177228698482761,
          "description": "min=0.518, mean=0.518, max=0.518, sum=0.518 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 46.25818750042243,
          "description": "min=46.258, mean=46.258, max=46.258, sum=46.258 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.5856102826183851,
          "description": "min=0.586, mean=0.586, max=0.586, sum=0.586 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.576928517857536,
          "description": "min=0.577, mean=0.577, max=0.577, sum=0.577 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 5.917445467233658,
          "description": "min=5.917, mean=5.917, max=5.917, sum=5.917 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.0863512297471365,
          "description": "min=4.086, mean=4.086, max=4.086, sum=4.086 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 5.294802710413933,
          "description": "min=5.295, mean=5.295, max=5.295, sum=5.295 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.257995302164102,
          "description": "min=1.258, mean=1.258, max=1.258, sum=1.258 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.5203437304496765,
          "description": "min=1.52, mean=1.52, max=1.52, sum=1.52 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 2.092606608627758,
          "description": "min=2.093, mean=2.093, max=2.093, sum=2.093 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 2.111854866384941,
          "description": "min=2.112, mean=2.112, max=2.112, sum=2.112 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.490330929241022,
          "description": "min=4.49, mean=4.49, max=4.49, sum=4.49 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.8974741188484632,
          "description": "min=0.896, mean=0.897, max=0.899, sum=1.795 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.5781944992542267,
          "description": "min=0.578, mean=0.578, max=0.578, sum=0.578 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 3.2412102206548057,
          "description": "min=3.241, mean=3.241, max=3.241, sum=3.241 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.3065443999731718,
          "description": "min=1.307, mean=1.307, max=1.307, sum=1.307 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.66272509654363,
          "description": "min=0.663, mean=0.663, max=0.663, sum=0.663 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.7041426730155945,
          "description": "min=0.704, mean=0.704, max=0.704, sum=0.704 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.6166051857471466,
          "description": "min=0.617, mean=0.617, max=0.617, sum=0.617 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0685200524330138,
          "description": "min=1.069, mean=1.069, max=1.069, sum=1.069 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.6242298223755577,
          "description": "min=0.624, mean=0.624, max=0.624, sum=0.624 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.4917304216030829,
          "description": "min=0.492, mean=0.492, max=0.492, sum=0.492 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 8.133637910665469,
          "description": "min=7.261, mean=8.134, max=8.6, sum=24.401 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.6268642969131469,
          "description": "min=0.627, mean=0.627, max=0.627, sum=0.627 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 2.053197555780411,
          "description": "min=2.053, mean=2.053, max=2.053, sum=2.053 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 3.2714920969913495,
          "description": "min=3.271, mean=3.271, max=3.271, sum=3.271 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 5.771784316778183,
          "description": "min=5.772, mean=5.772, max=5.772, sum=5.772 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.463045666217804,
          "description": "min=1.463, mean=1.463, max=1.463, sum=1.463 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.3 Instruct (70B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6286764705882353,
          "markdown": false
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.2965207255424805,
          "description": "min=0.258, mean=0.297, max=0.34, sum=3.855 (13)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=chronic_pain,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=homelessness,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=liver_disease,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=major_depression,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=personality_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=unemployment,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 9.148186227013083,
          "description": "min=9.148, mean=9.148, max=9.148, sum=9.148 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medec:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 10.951756122191748,
          "description": "min=9.876, mean=10.952, max=11.654, sum=32.855 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=lab_anemia,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=new_hypertension,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 23.10033937908659,
          "description": "min=23.1, mean=23.1, max=23.1, sum=23.1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 3.6418433290846806,
          "description": "min=3.642, mean=3.642, max=3.642, sum=3.642 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 2.073261349579024,
          "description": "min=2.073, mean=2.073, max=2.073, sum=2.073 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 7.064391430616379,
          "description": "min=7.064, mean=7.064, max=7.064, sum=7.064 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 11.603488028049469,
          "description": "min=11.603, mean=11.603, max=11.603, sum=11.603 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 9.823013886809349,
          "description": "min=9.823, mean=9.823, max=9.823, sum=9.823 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 5.954605188125219,
          "description": "min=5.955, mean=5.955, max=5.955, sum=5.955 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 2.7478469467163085,
          "description": "min=2.748, mean=2.748, max=2.748, sum=2.748 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 3.634354764675947,
          "description": "min=3.634, mean=3.634, max=3.634, sum=3.634 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 5.604794421631989,
          "description": "min=5.605, mean=5.605, max=5.605, sum=5.605 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 6.200638982397698,
          "description": "min=6.201, mean=6.201, max=6.201, sum=6.201 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.3253600368313856,
          "description": "min=1.291, mean=1.325, max=1.359, sum=2.651 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "med_dialog,subset=icliniq:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.248099797487259,
          "description": "min=1.248, mean=1.248, max=1.248, sum=1.248 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 8.413016567230224,
          "description": "min=8.413, mean=8.413, max=8.413, sum=8.413 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 2.180056931367561,
          "description": "min=2.18, mean=2.18, max=2.18, sum=2.18 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.2374761176109314,
          "description": "min=0.237, mean=0.237, max=0.237, sum=0.237 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.2110820166269938,
          "description": "min=0.211, mean=0.211, max=0.211, sum=0.211 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.2009574868462303,
          "description": "min=1.201, mean=1.201, max=1.201, sum=1.201 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.2071963594865427,
          "description": "min=1.117, mean=1.207, max=1.374, sum=3.622 (3)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=CREATININE,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 14.256210972547532,
          "description": "min=14.256, mean=14.256, max=14.256, sum=14.256 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 24.56117909253555,
          "description": "min=24.561, mean=24.561, max=24.561, sum=24.561 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 37.005855027914045,
          "description": "min=37.006, mean=37.006, max=37.006, sum=37.006 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 9.610506303310395,
          "description": "min=9.611, mean=9.611, max=9.611, sum=9.611 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        }
      ],
      [
        {
          "value": "o3-mini (2025-01-31)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.27205882352941174,
          "markdown": false
        },
        {
          "value": 12.23940966938351,
          "description": "min=12.239, mean=12.239, max=12.239, sum=12.239 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 2.478334377223607,
          "description": "min=2.001, mean=2.478, max=2.853, sum=32.218 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 9.537268293044146,
          "description": "min=9.537, mean=9.537, max=9.537, sum=9.537 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 17.57311107540453,
          "description": "min=17.573, mean=17.573, max=17.573, sum=17.573 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 22.073866986036297,
          "description": "min=9.261, mean=22.074, max=40.147, sum=66.222 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 3.9501672561168673,
          "description": "min=3.95, mean=3.95, max=3.95, sum=3.95 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 7.292820972281617,
          "description": "min=7.293, mean=7.293, max=7.293, sum=7.293 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 48.5786174415742,
          "description": "min=48.579, mean=48.579, max=48.579, sum=48.579 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 2.750029217266867,
          "description": "min=2.75, mean=2.75, max=2.75, sum=2.75 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 2.5139477516132627,
          "description": "min=2.514, mean=2.514, max=2.514, sum=2.514 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 10.002931237220764,
          "description": "min=10.003, mean=10.003, max=10.003, sum=10.003 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 6.996356503168742,
          "description": "min=6.996, mean=6.996, max=6.996, sum=6.996 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 9.186064189299941,
          "description": "min=9.186, mean=9.186, max=9.186, sum=9.186 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.547026488601521,
          "description": "min=4.547, mean=4.547, max=4.547, sum=4.547 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.090529644128048,
          "description": "min=4.091, mean=4.091, max=4.091, sum=4.091 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 10.660015441064228,
          "description": "min=10.66, mean=10.66, max=10.66, sum=10.66 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 7.778176902859581,
          "description": "min=7.778, mean=7.778, max=7.778, sum=7.778 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 12.239962604237395,
          "description": "min=12.24, mean=12.24, max=12.24, sum=12.24 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 3.500650955711766,
          "description": "min=3.271, mean=3.501, max=3.731, sum=7.001 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 2.8022545745372773,
          "description": "min=2.802, mean=2.802, max=2.802, sum=2.802 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 7.404716361363729,
          "description": "min=7.405, mean=7.405, max=7.405, sum=7.405 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 5.80801716135509,
          "description": "min=5.808, mean=5.808, max=5.808, sum=5.808 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 5.142877056598663,
          "description": "min=5.143, mean=5.143, max=5.143, sum=5.143 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 3.781061518192291,
          "description": "min=3.781, mean=3.781, max=3.781, sum=3.781 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 2.3193285653591156,
          "description": "min=2.319, mean=2.319, max=2.319, sum=2.319 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 9.879440756797791,
          "description": "min=9.879, mean=9.879, max=9.879, sum=9.879 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 3.363296365737915,
          "description": "min=3.363, mean=3.363, max=3.363, sum=3.363 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 6.315420240699174,
          "description": "min=6.315, mean=6.315, max=6.315, sum=6.315 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 8.013198332269063,
          "description": "min=6.503, mean=8.013, max=10.304, sum=24.04 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 5.188710170269013,
          "description": "min=5.189, mean=5.189, max=5.189, sum=5.189 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.608473005533218,
          "description": "min=4.608, mean=4.608, max=4.608, sum=4.608 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 5.290066151808526,
          "description": "min=5.29, mean=5.29, max=5.29, sum=5.29 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 12.857608857393265,
          "description": "min=12.858, mean=12.858, max=12.858, sum=12.858 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 4.357393162965774,
          "description": "min=4.357, mean=4.357, max=4.357, sum=4.357 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        }
      ]
    ],
    "links": [
      {
        "text": "LaTeX",
        "href": "benchmark_output/releases/v2.0.0/groups/latex/medhelm_scenarios_efficiency.tex"
      },
      {
        "text": "JSON",
        "href": "benchmark_output/releases/v2.0.0/groups/json/medhelm_scenarios_efficiency.json"
      }
    ],
    "name": "efficiency"
  },
  {
    "title": "General information",
    "header": [
      {
        "value": "Model",
        "markdown": false,
        "metadata": {}
      },
      {
        "value": "MedCalc-Bench - # eval",
        "description": "A dataset which consists of a patient note, a question requesting to compute a specific medical value, and a ground truth answer [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MedCalc-Bench"
        }
      },
      {
        "value": "MedCalc-Bench - # train",
        "description": "A dataset which consists of a patient note, a question requesting to compute a specific medical value, and a ground truth answer [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MedCalc-Bench"
        }
      },
      {
        "value": "MedCalc-Bench - truncated",
        "description": "A dataset which consists of a patient note, a question requesting to compute a specific medical value, and a ground truth answer [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MedCalc-Bench"
        }
      },
      {
        "value": "MedCalc-Bench - # prompt tokens",
        "description": "A dataset which consists of a patient note, a question requesting to compute a specific medical value, and a ground truth answer [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MedCalc-Bench"
        }
      },
      {
        "value": "MedCalc-Bench - # output tokens",
        "description": "A dataset which consists of a patient note, a question requesting to compute a specific medical value, and a ground truth answer [(Khandekar et al., 2024)](https://arxiv.org/abs/2406.12036).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MedCalc-Bench"
        }
      },
      {
        "value": "CLEAR - # eval",
        "description": "A dataset for evaluating the presence of a specific medical condition from patient notes with yes/no/maybe classifications [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "CLEAR"
        }
      },
      {
        "value": "CLEAR - # train",
        "description": "A dataset for evaluating the presence of a specific medical condition from patient notes with yes/no/maybe classifications [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "CLEAR"
        }
      },
      {
        "value": "CLEAR - truncated",
        "description": "A dataset for evaluating the presence of a specific medical condition from patient notes with yes/no/maybe classifications [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "CLEAR"
        }
      },
      {
        "value": "CLEAR - # prompt tokens",
        "description": "A dataset for evaluating the presence of a specific medical condition from patient notes with yes/no/maybe classifications [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "CLEAR"
        }
      },
      {
        "value": "CLEAR - # output tokens",
        "description": "A dataset for evaluating the presence of a specific medical condition from patient notes with yes/no/maybe classifications [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "CLEAR"
        }
      },
      {
        "value": "MTSamples - # eval",
        "description": "A dataset of clinical notes where the model is prompted to generate the appropriate treatment plan for this patient [(MTSamples, 2025)](https://mtsamples.com).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MTSamples"
        }
      },
      {
        "value": "MTSamples - # train",
        "description": "A dataset of clinical notes where the model is prompted to generate the appropriate treatment plan for this patient [(MTSamples, 2025)](https://mtsamples.com).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MTSamples"
        }
      },
      {
        "value": "MTSamples - truncated",
        "description": "A dataset of clinical notes where the model is prompted to generate the appropriate treatment plan for this patient [(MTSamples, 2025)](https://mtsamples.com).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MTSamples"
        }
      },
      {
        "value": "MTSamples - # prompt tokens",
        "description": "A dataset of clinical notes where the model is prompted to generate the appropriate treatment plan for this patient [(MTSamples, 2025)](https://mtsamples.com).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MTSamples"
        }
      },
      {
        "value": "MTSamples - # output tokens",
        "description": "A dataset of clinical notes where the model is prompted to generate the appropriate treatment plan for this patient [(MTSamples, 2025)](https://mtsamples.com).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MTSamples"
        }
      },
      {
        "value": "Medec - # eval",
        "description": "A dataset containing medical narratives with error detection and correction pairs [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "Medec"
        }
      },
      {
        "value": "Medec - # train",
        "description": "A dataset containing medical narratives with error detection and correction pairs [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "Medec"
        }
      },
      {
        "value": "Medec - truncated",
        "description": "A dataset containing medical narratives with error detection and correction pairs [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "Medec"
        }
      },
      {
        "value": "Medec - # prompt tokens",
        "description": "A dataset containing medical narratives with error detection and correction pairs [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "Medec"
        }
      },
      {
        "value": "Medec - # output tokens",
        "description": "A dataset containing medical narratives with error detection and correction pairs [(Abacha et al., 2025)](https://arxiv.org/abs/2412.19260).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "Medec"
        }
      },
      {
        "value": "EHRSHOT - # eval",
        "description": "A dataset given a patient record of EHR codes, classifying if an event will occur at a future date or not [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "EHRSHOT"
        }
      },
      {
        "value": "EHRSHOT - # train",
        "description": "A dataset given a patient record of EHR codes, classifying if an event will occur at a future date or not [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "EHRSHOT"
        }
      },
      {
        "value": "EHRSHOT - truncated",
        "description": "A dataset given a patient record of EHR codes, classifying if an event will occur at a future date or not [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "EHRSHOT"
        }
      },
      {
        "value": "EHRSHOT - # prompt tokens",
        "description": "A dataset given a patient record of EHR codes, classifying if an event will occur at a future date or not [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "EHRSHOT"
        }
      },
      {
        "value": "EHRSHOT - # output tokens",
        "description": "A dataset given a patient record of EHR codes, classifying if an event will occur at a future date or not [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "EHRSHOT"
        }
      },
      {
        "value": "HeadQA - # eval",
        "description": "A collection of biomedical multiple-choice questions for testing medical knowledge [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "HeadQA"
        }
      },
      {
        "value": "HeadQA - # train",
        "description": "A collection of biomedical multiple-choice questions for testing medical knowledge [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "HeadQA"
        }
      },
      {
        "value": "HeadQA - truncated",
        "description": "A collection of biomedical multiple-choice questions for testing medical knowledge [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "HeadQA"
        }
      },
      {
        "value": "HeadQA - # prompt tokens",
        "description": "A collection of biomedical multiple-choice questions for testing medical knowledge [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "HeadQA"
        }
      },
      {
        "value": "HeadQA - # output tokens",
        "description": "A collection of biomedical multiple-choice questions for testing medical knowledge [(Vilares et al., 2019)](https://arxiv.org/abs/1906.04701).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "HeadQA"
        }
      },
      {
        "value": "Medbullets - # eval",
        "description": "A USMLE-style medical question dataset with multiple-choice answers and explanations [(MedBullets, 2025)](https://step2.medbullets.com).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "Medbullets"
        }
      },
      {
        "value": "Medbullets - # train",
        "description": "A USMLE-style medical question dataset with multiple-choice answers and explanations [(MedBullets, 2025)](https://step2.medbullets.com).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "Medbullets"
        }
      },
      {
        "value": "Medbullets - truncated",
        "description": "A USMLE-style medical question dataset with multiple-choice answers and explanations [(MedBullets, 2025)](https://step2.medbullets.com).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "Medbullets"
        }
      },
      {
        "value": "Medbullets - # prompt tokens",
        "description": "A USMLE-style medical question dataset with multiple-choice answers and explanations [(MedBullets, 2025)](https://step2.medbullets.com).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "Medbullets"
        }
      },
      {
        "value": "Medbullets - # output tokens",
        "description": "A USMLE-style medical question dataset with multiple-choice answers and explanations [(MedBullets, 2025)](https://step2.medbullets.com).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "Medbullets"
        }
      },
      {
        "value": "MedAlign - # eval",
        "description": "A dataset that asks models to answer questions/follow instructions over longitudinal EHR [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MedAlign"
        }
      },
      {
        "value": "MedAlign - # train",
        "description": "A dataset that asks models to answer questions/follow instructions over longitudinal EHR [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MedAlign"
        }
      },
      {
        "value": "MedAlign - truncated",
        "description": "A dataset that asks models to answer questions/follow instructions over longitudinal EHR [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MedAlign"
        }
      },
      {
        "value": "MedAlign - # prompt tokens",
        "description": "A dataset that asks models to answer questions/follow instructions over longitudinal EHR [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MedAlign"
        }
      },
      {
        "value": "MedAlign - # output tokens",
        "description": "A dataset that asks models to answer questions/follow instructions over longitudinal EHR [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MedAlign"
        }
      },
      {
        "value": "ADHD-Behavior - # eval",
        "description": "A dataset that classifies whether a clinical note contains a clinician recommendation for parent training in behavior management, which is the first-line evidence-based treatment for young children with ADHD [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "ADHD-Behavior"
        }
      },
      {
        "value": "ADHD-Behavior - # train",
        "description": "A dataset that classifies whether a clinical note contains a clinician recommendation for parent training in behavior management, which is the first-line evidence-based treatment for young children with ADHD [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "ADHD-Behavior"
        }
      },
      {
        "value": "ADHD-Behavior - truncated",
        "description": "A dataset that classifies whether a clinical note contains a clinician recommendation for parent training in behavior management, which is the first-line evidence-based treatment for young children with ADHD [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "ADHD-Behavior"
        }
      },
      {
        "value": "ADHD-Behavior - # prompt tokens",
        "description": "A dataset that classifies whether a clinical note contains a clinician recommendation for parent training in behavior management, which is the first-line evidence-based treatment for young children with ADHD [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "ADHD-Behavior"
        }
      },
      {
        "value": "ADHD-Behavior - # output tokens",
        "description": "A dataset that classifies whether a clinical note contains a clinician recommendation for parent training in behavior management, which is the first-line evidence-based treatment for young children with ADHD [(Pillai et al., 2024)](https://doi.org/10.1093/jamia/ocae001).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "ADHD-Behavior"
        }
      },
      {
        "value": "ADHD-MedEffects - # eval",
        "description": "A dataset that classifies whether a clinical note contains documentation of side effect monitoring (recording of absence or presence of medication side effects), as recommended in clinical practice guidelines [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "ADHD-MedEffects"
        }
      },
      {
        "value": "ADHD-MedEffects - # train",
        "description": "A dataset that classifies whether a clinical note contains documentation of side effect monitoring (recording of absence or presence of medication side effects), as recommended in clinical practice guidelines [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "ADHD-MedEffects"
        }
      },
      {
        "value": "ADHD-MedEffects - truncated",
        "description": "A dataset that classifies whether a clinical note contains documentation of side effect monitoring (recording of absence or presence of medication side effects), as recommended in clinical practice guidelines [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "ADHD-MedEffects"
        }
      },
      {
        "value": "ADHD-MedEffects - # prompt tokens",
        "description": "A dataset that classifies whether a clinical note contains documentation of side effect monitoring (recording of absence or presence of medication side effects), as recommended in clinical practice guidelines [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "ADHD-MedEffects"
        }
      },
      {
        "value": "ADHD-MedEffects - # output tokens",
        "description": "A dataset that classifies whether a clinical note contains documentation of side effect monitoring (recording of absence or presence of medication side effects), as recommended in clinical practice guidelines [(Bannet et al., 2024)](https://doi.org/10.1542/peds.2024-067223).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "ADHD-MedEffects"
        }
      },
      {
        "value": "DischargeMe - # eval",
        "description": "DischargeMe is a discharge instruction generation dataset and brief hospital course generation dataset collected from MIMIC-IV data, considering only the discharge text as well as the radiology report text [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "DischargeMe"
        }
      },
      {
        "value": "DischargeMe - # train",
        "description": "DischargeMe is a discharge instruction generation dataset and brief hospital course generation dataset collected from MIMIC-IV data, considering only the discharge text as well as the radiology report text [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "DischargeMe"
        }
      },
      {
        "value": "DischargeMe - truncated",
        "description": "DischargeMe is a discharge instruction generation dataset and brief hospital course generation dataset collected from MIMIC-IV data, considering only the discharge text as well as the radiology report text [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "DischargeMe"
        }
      },
      {
        "value": "DischargeMe - # prompt tokens",
        "description": "DischargeMe is a discharge instruction generation dataset and brief hospital course generation dataset collected from MIMIC-IV data, considering only the discharge text as well as the radiology report text [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "DischargeMe"
        }
      },
      {
        "value": "DischargeMe - # output tokens",
        "description": "DischargeMe is a discharge instruction generation dataset and brief hospital course generation dataset collected from MIMIC-IV data, considering only the discharge text as well as the radiology report text [(Xu, 2024)](https://physionet.org/content/discharge-me/1.3/).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "DischargeMe"
        }
      },
      {
        "value": "ACI-Bench - # eval",
        "description": "A dataset of patient-doctor conversations paired with structured clinical notes [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "ACI-Bench"
        }
      },
      {
        "value": "ACI-Bench - # train",
        "description": "A dataset of patient-doctor conversations paired with structured clinical notes [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "ACI-Bench"
        }
      },
      {
        "value": "ACI-Bench - truncated",
        "description": "A dataset of patient-doctor conversations paired with structured clinical notes [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "ACI-Bench"
        }
      },
      {
        "value": "ACI-Bench - # prompt tokens",
        "description": "A dataset of patient-doctor conversations paired with structured clinical notes [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "ACI-Bench"
        }
      },
      {
        "value": "ACI-Bench - # output tokens",
        "description": "A dataset of patient-doctor conversations paired with structured clinical notes [(Yim et al., 2024)](https://www.nature.com/articles/s41597-023-02487-3).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "ACI-Bench"
        }
      },
      {
        "value": "MTSamples Procedures - # eval",
        "description": "A dataset that provides a patient note regarding an operation, with the objective to document the procedure.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MTSamples Procedures"
        }
      },
      {
        "value": "MTSamples Procedures - # train",
        "description": "A dataset that provides a patient note regarding an operation, with the objective to document the procedure.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MTSamples Procedures"
        }
      },
      {
        "value": "MTSamples Procedures - truncated",
        "description": "A dataset that provides a patient note regarding an operation, with the objective to document the procedure.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MTSamples Procedures"
        }
      },
      {
        "value": "MTSamples Procedures - # prompt tokens",
        "description": "A dataset that provides a patient note regarding an operation, with the objective to document the procedure.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MTSamples Procedures"
        }
      },
      {
        "value": "MTSamples Procedures - # output tokens",
        "description": "A dataset that provides a patient note regarding an operation, with the objective to document the procedure.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MTSamples Procedures"
        }
      },
      {
        "value": "MIMIC-RRS - # eval",
        "description": "A dataset containing radiology reports with findings sections from MIMIC-III paired with their corresponding impression sections, used for generating radiology report summaries [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MIMIC-RRS"
        }
      },
      {
        "value": "MIMIC-RRS - # train",
        "description": "A dataset containing radiology reports with findings sections from MIMIC-III paired with their corresponding impression sections, used for generating radiology report summaries [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MIMIC-RRS"
        }
      },
      {
        "value": "MIMIC-RRS - truncated",
        "description": "A dataset containing radiology reports with findings sections from MIMIC-III paired with their corresponding impression sections, used for generating radiology report summaries [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MIMIC-RRS"
        }
      },
      {
        "value": "MIMIC-RRS - # prompt tokens",
        "description": "A dataset containing radiology reports with findings sections from MIMIC-III paired with their corresponding impression sections, used for generating radiology report summaries [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MIMIC-RRS"
        }
      },
      {
        "value": "MIMIC-RRS - # output tokens",
        "description": "A dataset containing radiology reports with findings sections from MIMIC-III paired with their corresponding impression sections, used for generating radiology report summaries [(Chen et al., 2023)](https://arxiv.org/abs/2211.08584).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MIMIC-RRS"
        }
      },
      {
        "value": "MIMIC-BHC - # eval",
        "description": "A summarization task using a curated collection of preprocessed discharge notes paired with their corresponding brief hospital course (BHC) summaries [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MIMIC-BHC"
        }
      },
      {
        "value": "MIMIC-BHC - # train",
        "description": "A summarization task using a curated collection of preprocessed discharge notes paired with their corresponding brief hospital course (BHC) summaries [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MIMIC-BHC"
        }
      },
      {
        "value": "MIMIC-BHC - truncated",
        "description": "A summarization task using a curated collection of preprocessed discharge notes paired with their corresponding brief hospital course (BHC) summaries [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MIMIC-BHC"
        }
      },
      {
        "value": "MIMIC-BHC - # prompt tokens",
        "description": "A summarization task using a curated collection of preprocessed discharge notes paired with their corresponding brief hospital course (BHC) summaries [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MIMIC-BHC"
        }
      },
      {
        "value": "MIMIC-BHC - # output tokens",
        "description": "A summarization task using a curated collection of preprocessed discharge notes paired with their corresponding brief hospital course (BHC) summaries [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MIMIC-BHC"
        }
      },
      {
        "value": "NoteExtract - # eval",
        "description": "A dataset containing free form text of a clinical health worker care plan, with the associated goal being to restructure that text into a given format.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "NoteExtract"
        }
      },
      {
        "value": "NoteExtract - # train",
        "description": "A dataset containing free form text of a clinical health worker care plan, with the associated goal being to restructure that text into a given format.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "NoteExtract"
        }
      },
      {
        "value": "NoteExtract - truncated",
        "description": "A dataset containing free form text of a clinical health worker care plan, with the associated goal being to restructure that text into a given format.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "NoteExtract"
        }
      },
      {
        "value": "NoteExtract - # prompt tokens",
        "description": "A dataset containing free form text of a clinical health worker care plan, with the associated goal being to restructure that text into a given format.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "NoteExtract"
        }
      },
      {
        "value": "NoteExtract - # output tokens",
        "description": "A dataset containing free form text of a clinical health worker care plan, with the associated goal being to restructure that text into a given format.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "NoteExtract"
        }
      },
      {
        "value": "MedicationQA - # eval",
        "description": "Consumer medication questions with reference answers.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MedicationQA"
        }
      },
      {
        "value": "MedicationQA - # train",
        "description": "Consumer medication questions with reference answers.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MedicationQA"
        }
      },
      {
        "value": "MedicationQA - truncated",
        "description": "Consumer medication questions with reference answers.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MedicationQA"
        }
      },
      {
        "value": "MedicationQA - # prompt tokens",
        "description": "Consumer medication questions with reference answers.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MedicationQA"
        }
      },
      {
        "value": "MedicationQA - # output tokens",
        "description": "Consumer medication questions with reference answers.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MedicationQA"
        }
      },
      {
        "value": "PatientInstruct - # eval",
        "description": "A dataset containing case details used to generate customized post-procedure patient instructions.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "PatientInstruct"
        }
      },
      {
        "value": "PatientInstruct - # train",
        "description": "A dataset containing case details used to generate customized post-procedure patient instructions.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "PatientInstruct"
        }
      },
      {
        "value": "PatientInstruct - truncated",
        "description": "A dataset containing case details used to generate customized post-procedure patient instructions.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "PatientInstruct"
        }
      },
      {
        "value": "PatientInstruct - # prompt tokens",
        "description": "A dataset containing case details used to generate customized post-procedure patient instructions.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "PatientInstruct"
        }
      },
      {
        "value": "PatientInstruct - # output tokens",
        "description": "A dataset containing case details used to generate customized post-procedure patient instructions.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "PatientInstruct"
        }
      },
      {
        "value": "MedDialog - # eval",
        "description": "A collection of doctor-patient conversations with corresponding summaries.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MedDialog"
        }
      },
      {
        "value": "MedDialog - # train",
        "description": "A collection of doctor-patient conversations with corresponding summaries.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MedDialog"
        }
      },
      {
        "value": "MedDialog - truncated",
        "description": "A collection of doctor-patient conversations with corresponding summaries.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MedDialog"
        }
      },
      {
        "value": "MedDialog - # prompt tokens",
        "description": "A collection of doctor-patient conversations with corresponding summaries.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MedDialog"
        }
      },
      {
        "value": "MedDialog - # output tokens",
        "description": "A collection of doctor-patient conversations with corresponding summaries.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MedDialog"
        }
      },
      {
        "value": "MedConfInfo - # eval",
        "description": "A dataset of clinical notes from adolescent patients used to identify sensitive protected health information that should be restricted from parental access [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MedConfInfo"
        }
      },
      {
        "value": "MedConfInfo - # train",
        "description": "A dataset of clinical notes from adolescent patients used to identify sensitive protected health information that should be restricted from parental access [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MedConfInfo"
        }
      },
      {
        "value": "MedConfInfo - truncated",
        "description": "A dataset of clinical notes from adolescent patients used to identify sensitive protected health information that should be restricted from parental access [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MedConfInfo"
        }
      },
      {
        "value": "MedConfInfo - # prompt tokens",
        "description": "A dataset of clinical notes from adolescent patients used to identify sensitive protected health information that should be restricted from parental access [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MedConfInfo"
        }
      },
      {
        "value": "MedConfInfo - # output tokens",
        "description": "A dataset of clinical notes from adolescent patients used to identify sensitive protected health information that should be restricted from parental access [(Rabbani et al., 2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MedConfInfo"
        }
      },
      {
        "value": "MEDIQA - # eval",
        "description": "A dataset including a medical question, a set of candidate answers, relevance annotations for ranking, and additional context to evaluate understanding and retrieval capabilities in a healthcare setting.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MEDIQA"
        }
      },
      {
        "value": "MEDIQA - # train",
        "description": "A dataset including a medical question, a set of candidate answers, relevance annotations for ranking, and additional context to evaluate understanding and retrieval capabilities in a healthcare setting.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MEDIQA"
        }
      },
      {
        "value": "MEDIQA - truncated",
        "description": "A dataset including a medical question, a set of candidate answers, relevance annotations for ranking, and additional context to evaluate understanding and retrieval capabilities in a healthcare setting.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MEDIQA"
        }
      },
      {
        "value": "MEDIQA - # prompt tokens",
        "description": "A dataset including a medical question, a set of candidate answers, relevance annotations for ranking, and additional context to evaluate understanding and retrieval capabilities in a healthcare setting.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MEDIQA"
        }
      },
      {
        "value": "MEDIQA - # output tokens",
        "description": "A dataset including a medical question, a set of candidate answers, relevance annotations for ranking, and additional context to evaluate understanding and retrieval capabilities in a healthcare setting.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MEDIQA"
        }
      },
      {
        "value": "MentalHealth - # eval",
        "description": "A dataset containing a counselor and mental health patient conversation, where the objective is to generate an empathetic counselor response.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MentalHealth"
        }
      },
      {
        "value": "MentalHealth - # train",
        "description": "A dataset containing a counselor and mental health patient conversation, where the objective is to generate an empathetic counselor response.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MentalHealth"
        }
      },
      {
        "value": "MentalHealth - truncated",
        "description": "A dataset containing a counselor and mental health patient conversation, where the objective is to generate an empathetic counselor response.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MentalHealth"
        }
      },
      {
        "value": "MentalHealth - # prompt tokens",
        "description": "A dataset containing a counselor and mental health patient conversation, where the objective is to generate an empathetic counselor response.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MentalHealth"
        }
      },
      {
        "value": "MentalHealth - # output tokens",
        "description": "A dataset containing a counselor and mental health patient conversation, where the objective is to generate an empathetic counselor response.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MentalHealth"
        }
      },
      {
        "value": "ProxySender - # eval",
        "description": "This dataset features messages sent by proxy users and non proxy users, for evaluation of LLM capabilities to determine the sender [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "ProxySender"
        }
      },
      {
        "value": "ProxySender - # train",
        "description": "This dataset features messages sent by proxy users and non proxy users, for evaluation of LLM capabilities to determine the sender [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "ProxySender"
        }
      },
      {
        "value": "ProxySender - truncated",
        "description": "This dataset features messages sent by proxy users and non proxy users, for evaluation of LLM capabilities to determine the sender [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "ProxySender"
        }
      },
      {
        "value": "ProxySender - # prompt tokens",
        "description": "This dataset features messages sent by proxy users and non proxy users, for evaluation of LLM capabilities to determine the sender [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "ProxySender"
        }
      },
      {
        "value": "ProxySender - # output tokens",
        "description": "This dataset features messages sent by proxy users and non proxy users, for evaluation of LLM capabilities to determine the sender [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "ProxySender"
        }
      },
      {
        "value": "PrivacyDetection - # eval",
        "description": "A dataset that determines if a message leaks any confidential information from the patient [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "PrivacyDetection"
        }
      },
      {
        "value": "PrivacyDetection - # train",
        "description": "A dataset that determines if a message leaks any confidential information from the patient [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "PrivacyDetection"
        }
      },
      {
        "value": "PrivacyDetection - truncated",
        "description": "A dataset that determines if a message leaks any confidential information from the patient [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "PrivacyDetection"
        }
      },
      {
        "value": "PrivacyDetection - # prompt tokens",
        "description": "A dataset that determines if a message leaks any confidential information from the patient [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "PrivacyDetection"
        }
      },
      {
        "value": "PrivacyDetection - # output tokens",
        "description": "A dataset that determines if a message leaks any confidential information from the patient [(Tse G, et al., 2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "PrivacyDetection"
        }
      },
      {
        "value": "PubMedQA - # eval",
        "description": "A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "PubMedQA"
        }
      },
      {
        "value": "PubMedQA - # train",
        "description": "A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "PubMedQA"
        }
      },
      {
        "value": "PubMedQA - truncated",
        "description": "A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "PubMedQA"
        }
      },
      {
        "value": "PubMedQA - # prompt tokens",
        "description": "A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "PubMedQA"
        }
      },
      {
        "value": "PubMedQA - # output tokens",
        "description": "A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "PubMedQA"
        }
      },
      {
        "value": "EHRSQL - # eval",
        "description": "Given a natural language instruction, generate an SQL query that would be used in clinical research.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "EHRSQL"
        }
      },
      {
        "value": "EHRSQL - # train",
        "description": "Given a natural language instruction, generate an SQL query that would be used in clinical research.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "EHRSQL"
        }
      },
      {
        "value": "EHRSQL - truncated",
        "description": "Given a natural language instruction, generate an SQL query that would be used in clinical research.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "EHRSQL"
        }
      },
      {
        "value": "EHRSQL - # prompt tokens",
        "description": "Given a natural language instruction, generate an SQL query that would be used in clinical research.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "EHRSQL"
        }
      },
      {
        "value": "EHRSQL - # output tokens",
        "description": "Given a natural language instruction, generate an SQL query that would be used in clinical research.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "EHRSQL"
        }
      },
      {
        "value": "BMT-Status - # eval",
        "description": "A dataset containing patient notes with associated questions and answers related to bone marrow transplantation.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "BMT-Status"
        }
      },
      {
        "value": "BMT-Status - # train",
        "description": "A dataset containing patient notes with associated questions and answers related to bone marrow transplantation.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "BMT-Status"
        }
      },
      {
        "value": "BMT-Status - truncated",
        "description": "A dataset containing patient notes with associated questions and answers related to bone marrow transplantation.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "BMT-Status"
        }
      },
      {
        "value": "BMT-Status - # prompt tokens",
        "description": "A dataset containing patient notes with associated questions and answers related to bone marrow transplantation.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "BMT-Status"
        }
      },
      {
        "value": "BMT-Status - # output tokens",
        "description": "A dataset containing patient notes with associated questions and answers related to bone marrow transplantation.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "BMT-Status"
        }
      },
      {
        "value": "RaceBias - # eval",
        "description": "A collection of LLM outputs in response to medical questions with race-based biases, with the objective being to classify whether the output contains racially biased content.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "RaceBias"
        }
      },
      {
        "value": "RaceBias - # train",
        "description": "A collection of LLM outputs in response to medical questions with race-based biases, with the objective being to classify whether the output contains racially biased content.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "RaceBias"
        }
      },
      {
        "value": "RaceBias - truncated",
        "description": "A collection of LLM outputs in response to medical questions with race-based biases, with the objective being to classify whether the output contains racially biased content.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "RaceBias"
        }
      },
      {
        "value": "RaceBias - # prompt tokens",
        "description": "A collection of LLM outputs in response to medical questions with race-based biases, with the objective being to classify whether the output contains racially biased content.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "RaceBias"
        }
      },
      {
        "value": "RaceBias - # output tokens",
        "description": "A collection of LLM outputs in response to medical questions with race-based biases, with the objective being to classify whether the output contains racially biased content.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "RaceBias"
        }
      },
      {
        "value": "N2C2-CT - # eval",
        "description": "A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "N2C2-CT"
        }
      },
      {
        "value": "N2C2-CT - # train",
        "description": "A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "N2C2-CT"
        }
      },
      {
        "value": "N2C2-CT - truncated",
        "description": "A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "N2C2-CT"
        }
      },
      {
        "value": "N2C2-CT - # prompt tokens",
        "description": "A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "N2C2-CT"
        }
      },
      {
        "value": "N2C2-CT - # output tokens",
        "description": "A dataset that provides clinical notes and asks the model to classify whether the patient is a valid candidate for a provided clinical trial.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "N2C2-CT"
        }
      },
      {
        "value": "MedHallu - # eval",
        "description": "A dataset of PubMed articles and associated questions, with the objective being to classify whether the answer is factual or hallucinated.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MedHallu"
        }
      },
      {
        "value": "MedHallu - # train",
        "description": "A dataset of PubMed articles and associated questions, with the objective being to classify whether the answer is factual or hallucinated.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MedHallu"
        }
      },
      {
        "value": "MedHallu - truncated",
        "description": "A dataset of PubMed articles and associated questions, with the objective being to classify whether the answer is factual or hallucinated.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MedHallu"
        }
      },
      {
        "value": "MedHallu - # prompt tokens",
        "description": "A dataset of PubMed articles and associated questions, with the objective being to classify whether the answer is factual or hallucinated.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MedHallu"
        }
      },
      {
        "value": "MedHallu - # output tokens",
        "description": "A dataset of PubMed articles and associated questions, with the objective being to classify whether the answer is factual or hallucinated.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MedHallu"
        }
      },
      {
        "value": "HospiceReferral - # eval",
        "description": "A dataset evaluating performance in identifying appropriate patient referrals to hospice care.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "HospiceReferral"
        }
      },
      {
        "value": "HospiceReferral - # train",
        "description": "A dataset evaluating performance in identifying appropriate patient referrals to hospice care.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "HospiceReferral"
        }
      },
      {
        "value": "HospiceReferral - truncated",
        "description": "A dataset evaluating performance in identifying appropriate patient referrals to hospice care.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "HospiceReferral"
        }
      },
      {
        "value": "HospiceReferral - # prompt tokens",
        "description": "A dataset evaluating performance in identifying appropriate patient referrals to hospice care.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "HospiceReferral"
        }
      },
      {
        "value": "HospiceReferral - # output tokens",
        "description": "A dataset evaluating performance in identifying appropriate patient referrals to hospice care.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "HospiceReferral"
        }
      },
      {
        "value": "MIMIC-IV Billing Code - # eval",
        "description": "A  dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MIMIC-IV Billing Code"
        }
      },
      {
        "value": "MIMIC-IV Billing Code - # train",
        "description": "A  dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MIMIC-IV Billing Code"
        }
      },
      {
        "value": "MIMIC-IV Billing Code - truncated",
        "description": "A  dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MIMIC-IV Billing Code"
        }
      },
      {
        "value": "MIMIC-IV Billing Code - # prompt tokens",
        "description": "A  dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MIMIC-IV Billing Code"
        }
      },
      {
        "value": "MIMIC-IV Billing Code - # output tokens",
        "description": "A  dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MIMIC-IV Billing Code"
        }
      },
      {
        "value": "ClinicReferral - # eval",
        "description": "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "ClinicReferral"
        }
      },
      {
        "value": "ClinicReferral - # train",
        "description": "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "ClinicReferral"
        }
      },
      {
        "value": "ClinicReferral - truncated",
        "description": "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "ClinicReferral"
        }
      },
      {
        "value": "ClinicReferral - # prompt tokens",
        "description": "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "ClinicReferral"
        }
      },
      {
        "value": "ClinicReferral - # output tokens",
        "description": "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "ClinicReferral"
        }
      },
      {
        "value": "CDI-QA - # eval",
        "description": "A dataset built from Clinical Document Integrity (CDI) notes, to assess the ability to answer verification questions from previous notes.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "CDI-QA"
        }
      },
      {
        "value": "CDI-QA - # train",
        "description": "A dataset built from Clinical Document Integrity (CDI) notes, to assess the ability to answer verification questions from previous notes.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "CDI-QA"
        }
      },
      {
        "value": "CDI-QA - truncated",
        "description": "A dataset built from Clinical Document Integrity (CDI) notes, to assess the ability to answer verification questions from previous notes.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "CDI-QA"
        }
      },
      {
        "value": "CDI-QA - # prompt tokens",
        "description": "A dataset built from Clinical Document Integrity (CDI) notes, to assess the ability to answer verification questions from previous notes.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "CDI-QA"
        }
      },
      {
        "value": "CDI-QA - # output tokens",
        "description": "A dataset built from Clinical Document Integrity (CDI) notes, to assess the ability to answer verification questions from previous notes.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "CDI-QA"
        }
      },
      {
        "value": "ENT-Referral - # eval",
        "description": "A dataset designed to evaluate performance in identifying appropriate patient referrals to Ear, Nose, and Throat specialists.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "ENT-Referral"
        }
      },
      {
        "value": "ENT-Referral - # train",
        "description": "A dataset designed to evaluate performance in identifying appropriate patient referrals to Ear, Nose, and Throat specialists.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "ENT-Referral"
        }
      },
      {
        "value": "ENT-Referral - truncated",
        "description": "A dataset designed to evaluate performance in identifying appropriate patient referrals to Ear, Nose, and Throat specialists.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "ENT-Referral"
        }
      },
      {
        "value": "ENT-Referral - # prompt tokens",
        "description": "A dataset designed to evaluate performance in identifying appropriate patient referrals to Ear, Nose, and Throat specialists.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "ENT-Referral"
        }
      },
      {
        "value": "ENT-Referral - # output tokens",
        "description": "A dataset designed to evaluate performance in identifying appropriate patient referrals to Ear, Nose, and Throat specialists.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "ENT-Referral"
        }
      }
    ],
    "rows": [
      [
        {
          "value": "Claude 3.5 Sonnet (20241022)",
          "description": "",
          "markdown": false
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 579.799,
          "description": "min=579.799, mean=579.799, max=579.799, sum=579.799 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 2.397,
          "description": "min=2.397, mean=2.397, max=2.397, sum=2.397 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 78.61538461538461,
          "description": "min=28, mean=78.615, max=100, sum=1022 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 480.8413416196609,
          "description": "min=446.011, mean=480.841, max=525.657, sum=6250.937 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 427.0,
          "description": "min=427, mean=427, max=427, sum=427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 794.0585480093677,
          "description": "min=794.059, mean=794.059, max=794.059, sum=794.059 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 597.0,
          "description": "min=597, mean=597, max=597, sum=597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 320.5175879396985,
          "description": "min=320.518, mean=320.518, max=320.518, sum=320.518 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 115.89614740368509,
          "description": "min=115.896, mean=115.896, max=115.896, sum=115.896 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=3000 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 33185.73533333334,
          "description": "min=29931.775, mean=33185.735, max=35478.742, sum=99557.206 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 163.022,
          "description": "min=163.022, mean=163.022, max=163.022, sum=163.022 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 308.0,
          "description": "min=308, mean=308, max=308, sum=308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 345.6266233766234,
          "description": "min=345.627, mean=345.627, max=345.627, sum=345.627 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 149.0,
          "description": "min=149, mean=149, max=149, sum=149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 55740.29530201342,
          "description": "min=55740.295, mean=55740.295, max=55740.295, sum=55740.295 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 423.0,
          "description": "min=423, mean=423, max=423, sum=423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 292.11820330969266,
          "description": "min=292.118, mean=292.118, max=292.118, sum=292.118 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 915.0,
          "description": "min=915, mean=915, max=915, sum=915 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 777.151912568306,
          "description": "min=777.152, mean=777.152, max=777.152, sum=777.152 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 3668.27,
          "description": "min=3668.27, mean=3668.27, max=3668.27, sum=3668.27 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 120.0,
          "description": "min=120, mean=120, max=120, sum=120 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1674.0583333333334,
          "description": "min=1674.058, mean=1674.058, max=1674.058, sum=1674.058 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 128.0,
          "description": "min=128, mean=128, max=128, sum=128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 309.640625,
          "description": "min=309.641, mean=309.641, max=309.641, sum=309.641 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 212.638,
          "description": "min=212.638, mean=212.638, max=212.638, sum=212.638 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 653.94,
          "description": "min=653.94, mean=653.94, max=653.94, sum=653.94 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 487.0,
          "description": "min=487, mean=487, max=487, sum=487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 779.6509240246406,
          "description": "min=779.651, mean=779.651, max=779.651, sum=779.651 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 689.0,
          "description": "min=689, mean=689, max=689, sum=689 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 25.08708272859216,
          "description": "min=25.087, mean=25.087, max=25.087, sum=25.087 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 361.0,
          "description": "min=361, mean=361, max=361, sum=361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 2571.130193905817,
          "description": "min=2571.13, mean=2571.13, max=2571.13, sum=2571.13 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=2000 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 267.246,
          "description": "min=244.906, mean=267.246, max=289.586, sum=534.492 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 2062.048,
          "description": "min=2062.048, mean=2062.048, max=2062.048, sum=2062.048 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 150.0,
          "description": "min=150, mean=150, max=150, sum=150 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 40.62,
          "description": "min=40.62, mean=40.62, max=40.62, sum=40.62 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 67.0,
          "description": "min=67, mean=67, max=67, sum=67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 703.9402985074627,
          "description": "min=703.94, mean=703.94, max=703.94, sum=703.94 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 188.42333333333335,
          "description": "min=188.423, mean=188.423, max=188.423, sum=188.423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 265.67,
          "description": "min=265.67, mean=265.67, max=265.67, sum=265.67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 375.302,
          "description": "min=375.302, mean=375.302, max=375.302, sum=375.302 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1126.031,
          "description": "min=1126.031, mean=1126.031, max=1126.031, sum=1126.031 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 79.787,
          "description": "min=79.787, mean=79.787, max=79.787, sum=79.787 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 220.0,
          "description": "min=220, mean=220, max=220, sum=220 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 2041.9727272727273,
          "description": "min=2041.973, mean=2041.973, max=2041.973, sum=2041.973 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 167.0,
          "description": "min=167, mean=167, max=167, sum=167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 490.29940119760477,
          "description": "min=490.299, mean=490.299, max=490.299, sum=490.299 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 86.0,
          "description": "min=86, mean=86, max=86, sum=258 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 5753.391472868217,
          "description": "min=5702.058, mean=5753.391, max=5830.058, sum=17260.174 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 715.166,
          "description": "min=715.166, mean=715.166, max=715.166, sum=715.166 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 22974.077,
          "description": "min=22974.077, mean=22974.077, max=22974.077, sum=22974.077 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 327.0,
          "description": "min=327, mean=327, max=327, sum=327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 40203.091743119265,
          "description": "min=40203.092, mean=40203.092, max=40203.092, sum=40203.092 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 63153.442,
          "description": "min=63153.442, mean=63153.442, max=63153.442, sum=63153.442 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 15388.833,
          "description": "min=15388.833, mean=15388.833, max=15388.833, sum=15388.833 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-5-sonnet-20241022,model_deployment=stanfordhealthcare_claude-3-5-sonnet-20241022"
          ]
        }
      ],
      [
        {
          "value": "Claude 3.7 Sonnet (20250219)",
          "description": "",
          "markdown": false
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 579.799,
          "description": "min=579.799, mean=579.799, max=579.799, sum=579.799 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 2.091,
          "description": "min=2.091, mean=2.091, max=2.091, sum=2.091 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 78.61538461538461,
          "description": "min=28, mean=78.615, max=100, sum=1022 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 480.8413416196609,
          "description": "min=446.011, mean=480.841, max=525.657, sum=6250.937 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=chronic_pain,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=homelessness,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=liver_disease,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=major_depression,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=personality_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "clear:condition=unemployment,max_eval_instances=100,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 427.0,
          "description": "min=427, mean=427, max=427, sum=427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 794.0585480093677,
          "description": "min=794.059, mean=794.059, max=794.059, sum=794.059 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 597.0,
          "description": "min=597, mean=597, max=597, sum=597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 320.5175879396985,
          "description": "min=320.518, mean=320.518, max=320.518, sum=320.518 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 17.23785594639866,
          "description": "min=17.238, mean=17.238, max=17.238, sum=17.238 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=3000 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 33185.73533333334,
          "description": "min=29931.775, mean=33185.735, max=35478.742, sum=99557.206 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=lab_anemia,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "ehrshot:subject=new_hypertension,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 163.022,
          "description": "min=163.022, mean=163.022, max=163.022, sum=163.022 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 308.0,
          "description": "min=308, mean=308, max=308, sum=308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 345.6266233766234,
          "description": "min=345.627, mean=345.627, max=345.627, sum=345.627 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 149.0,
          "description": "min=149, mean=149, max=149, sum=149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 55740.29530201342,
          "description": "min=55740.295, mean=55740.295, max=55740.295, sum=55740.295 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 423.0,
          "description": "min=423, mean=423, max=423, sum=423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 292.11820330969266,
          "description": "min=292.118, mean=292.118, max=292.118, sum=292.118 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 915.0,
          "description": "min=915, mean=915, max=915, sum=915 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 777.151912568306,
          "description": "min=777.152, mean=777.152, max=777.152, sum=777.152 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 3668.27,
          "description": "min=3668.27, mean=3668.27, max=3668.27, sum=3668.27 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 120.0,
          "description": "min=120, mean=120, max=120, sum=120 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1674.0583333333334,
          "description": "min=1674.058, mean=1674.058, max=1674.058, sum=1674.058 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 128.0,
          "description": "min=128, mean=128, max=128, sum=128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 309.640625,
          "description": "min=309.641, mean=309.641, max=309.641, sum=309.641 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 212.638,
          "description": "min=212.638, mean=212.638, max=212.638, sum=212.638 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 653.94,
          "description": "min=653.94, mean=653.94, max=653.94, sum=653.94 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 487.0,
          "description": "min=487, mean=487, max=487, sum=487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 779.6509240246406,
          "description": "min=779.651, mean=779.651, max=779.651, sum=779.651 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 689.0,
          "description": "min=689, mean=689, max=689, sum=689 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 25.08708272859216,
          "description": "min=25.087, mean=25.087, max=25.087, sum=25.087 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 361.0,
          "description": "min=361, mean=361, max=361, sum=361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 2571.130193905817,
          "description": "min=2571.13, mean=2571.13, max=2571.13, sum=2571.13 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=2000 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 267.246,
          "description": "min=244.906, mean=267.246, max=289.586, sum=534.492 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "med_dialog,subset=icliniq:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 2062.048,
          "description": "min=2062.048, mean=2062.048, max=2062.048, sum=2062.048 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 150.0,
          "description": "min=150, mean=150, max=150, sum=150 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 40.62,
          "description": "min=40.62, mean=40.62, max=40.62, sum=40.62 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 67.0,
          "description": "min=67, mean=67, max=67, sum=67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 703.9402985074627,
          "description": "min=703.94, mean=703.94, max=703.94, sum=703.94 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 188.42333333333335,
          "description": "min=188.423, mean=188.423, max=188.423, sum=188.423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 265.67,
          "description": "min=265.67, mean=265.67, max=265.67, sum=265.67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 375.302,
          "description": "min=375.302, mean=375.302, max=375.302, sum=375.302 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1126.031,
          "description": "min=1126.031, mean=1126.031, max=1126.031, sum=1126.031 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 81.993,
          "description": "min=81.993, mean=81.993, max=81.993, sum=81.993 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 220.0,
          "description": "min=220, mean=220, max=220, sum=220 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 2041.9727272727273,
          "description": "min=2041.973, mean=2041.973, max=2041.973, sum=2041.973 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 167.0,
          "description": "min=167, mean=167, max=167, sum=167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 490.29940119760477,
          "description": "min=490.299, mean=490.299, max=490.299, sum=490.299 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 86.0,
          "description": "min=86, mean=86, max=86, sum=258 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 5753.391472868217,
          "description": "min=5702.058, mean=5753.391, max=5830.058, sum=17260.174 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219",
            "n2c2_ct_matching:subject=CREATININE,model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 715.166,
          "description": "min=715.166, mean=715.166, max=715.166, sum=715.166 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 22974.077,
          "description": "min=22974.077, mean=22974.077, max=22974.077, sum=22974.077 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 327.0,
          "description": "min=327, mean=327, max=327, sum=327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 40203.091743119265,
          "description": "min=40203.092, mean=40203.092, max=40203.092, sum=40203.092 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 63153.442,
          "description": "min=63153.442, mean=63153.442, max=63153.442, sum=63153.442 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 15388.833,
          "description": "min=15388.833, mean=15388.833, max=15388.833, sum=15388.833 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=anthropic_claude-3-7-sonnet-20250219,model_deployment=stanfordhealthcare_claude-3-7-sonnet-20250219"
          ]
        }
      ],
      [
        {
          "value": "DeepSeek R1",
          "description": "",
          "markdown": false
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 551.78,
          "description": "min=551.78, mean=551.78, max=551.78, sum=551.78 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 78.61538461538461,
          "description": "min=28, mean=78.615, max=100, sum=1022 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 478.6682976038255,
          "description": "min=441.728, mean=478.668, max=512.09, sum=6222.688 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.9976254180602008,
          "description": "min=0.989, mean=0.998, max=1, sum=12.969 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 427.0,
          "description": "min=427, mean=427, max=427, sum=427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 756.0468384074942,
          "description": "min=756.047, mean=756.047, max=756.047, sum=756.047 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 596.5573770491803,
          "description": "min=596.557, mean=596.557, max=596.557, sum=596.557 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 597.0,
          "description": "min=597, mean=597, max=597, sum=597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 304.89447236180905,
          "description": "min=304.894, mean=304.894, max=304.894, sum=304.894 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=3000 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 32781.75400000001,
          "description": "min=29560.991, mean=32781.754, max=35063.189, sum=98345.262 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.975,
          "description": "min=0.952, mean=0.975, max=0.994, sum=2.925 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 146.889,
          "description": "min=146.889, mean=146.889, max=146.889, sum=146.889 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 308.0,
          "description": "min=308, mean=308, max=308, sum=308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 330.1655844155844,
          "description": "min=330.166, mean=330.166, max=330.166, sum=330.166 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 149.0,
          "description": "min=149, mean=149, max=149, sum=149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 55318.20134228188,
          "description": "min=55318.201, mean=55318.201, max=55318.201, sum=55318.201 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 276.1275167785235,
          "description": "min=276.128, mean=276.128, max=276.128, sum=276.128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 423.0,
          "description": "min=423, mean=423, max=423, sum=423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 279.1985815602837,
          "description": "min=279.199, mean=279.199, max=279.199, sum=279.199 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 915.0,
          "description": "min=915, mean=915, max=915, sum=915 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 730.0043715846995,
          "description": "min=730.004, mean=730.004, max=730.004, sum=730.004 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.9989071038251366,
          "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 3383.91,
          "description": "min=3383.91, mean=3383.91, max=3383.91, sum=3383.91 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 472.373,
          "description": "min=472.373, mean=472.373, max=472.373, sum=472.373 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 120.0,
          "description": "min=120, mean=120, max=120, sum=120 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1613.1666666666667,
          "description": "min=1613.167, mean=1613.167, max=1613.167, sum=1613.167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 465.925,
          "description": "min=465.925, mean=465.925, max=465.925, sum=465.925 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 128.0,
          "description": "min=128, mean=128, max=128, sum=128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 289.765625,
          "description": "min=289.766, mean=289.766, max=289.766, sum=289.766 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 590.8515625,
          "description": "min=590.852, mean=590.852, max=590.852, sum=590.852 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 199.291,
          "description": "min=199.291, mean=199.291, max=199.291, sum=199.291 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 94.074,
          "description": "min=94.074, mean=94.074, max=94.074, sum=94.074 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 635.72,
          "description": "min=635.72, mean=635.72, max=635.72, sum=635.72 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 149.09,
          "description": "min=149.09, mean=149.09, max=149.09, sum=149.09 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 487.0,
          "description": "min=487, mean=487, max=487, sum=487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 721.3819301848049,
          "description": "min=721.382, mean=721.382, max=721.382, sum=721.382 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 127.06570841889118,
          "description": "min=127.066, mean=127.066, max=127.066, sum=127.066 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 689.0,
          "description": "min=689, mean=689, max=689, sum=689 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 22.378809869375907,
          "description": "min=22.379, mean=22.379, max=22.379, sum=22.379 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 285.86066763425254,
          "description": "min=285.861, mean=285.861, max=285.861, sum=285.861 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 361.0,
          "description": "min=361, mean=361, max=361, sum=361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 2541.786703601108,
          "description": "min=2541.787, mean=2541.787, max=2541.787, sum=2541.787 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 699.0747922437673,
          "description": "min=699.075, mean=699.075, max=699.075, sum=699.075 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=2000 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 260.207,
          "description": "min=239.027, mean=260.207, max=281.387, sum=520.414 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 58.4075,
          "description": "min=57.567, mean=58.407, max=59.248, sum=116.815 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 2017.948,
          "description": "min=2017.948, mean=2017.948, max=2017.948, sum=2017.948 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 150.0,
          "description": "min=150, mean=150, max=150, sum=150 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 37.50666666666667,
          "description": "min=37.507, mean=37.507, max=37.507, sum=37.507 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 421.92,
          "description": "min=421.92, mean=421.92, max=421.92, sum=421.92 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 67.0,
          "description": "min=67, mean=67, max=67, sum=67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 687.7014925373135,
          "description": "min=687.701, mean=687.701, max=687.701, sum=687.701 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 240.29850746268656,
          "description": "min=240.299, mean=240.299, max=240.299, sum=240.299 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 183.22666666666666,
          "description": "min=183.227, mean=183.227, max=183.227, sum=183.227 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 256.9033333333333,
          "description": "min=256.903, mean=256.903, max=256.903, sum=256.903 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 364.658,
          "description": "min=364.658, mean=364.658, max=364.658, sum=364.658 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1002.984,
          "description": "min=1002.984, mean=1002.984, max=1002.984, sum=1002.984 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 220.0,
          "description": "min=220, mean=220, max=220, sum=220 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 2042.3363636363636,
          "description": "min=2042.336, mean=2042.336, max=2042.336, sum=2042.336 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1.0363636363636364,
          "description": "min=1.036, mean=1.036, max=1.036, sum=1.036 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 167.0,
          "description": "min=167, mean=167, max=167, sum=167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 466.94610778443115,
          "description": "min=466.946, mean=466.946, max=466.946, sum=466.946 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 86.0,
          "description": "min=86, mean=86, max=86, sum=258 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 5437.860465116279,
          "description": "min=5387.86, mean=5437.86, max=5515.86, sum=16313.581 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.9922480620155039,
          "description": "min=0.977, mean=0.992, max=1, sum=2.977 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 694.905,
          "description": "min=694.905, mean=694.905, max=694.905, sum=694.905 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 22653.558,
          "description": "min=22653.558, mean=22653.558, max=22653.558, sum=22653.558 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1.003,
          "description": "min=1.003, mean=1.003, max=1.003, sum=1.003 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 327.0,
          "description": "min=327, mean=327, max=327, sum=327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 39468.477064220184,
          "description": "min=39468.477, mean=39468.477, max=39468.477, sum=39468.477 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1.003058103975535,
          "description": "min=1.003, mean=1.003, max=1.003, sum=1.003 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 62288.604,
          "description": "min=62288.604, mean=62288.604, max=62288.604, sum=62288.604 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1.017,
          "description": "min=1.017, mean=1.017, max=1.017, sum=1.017 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 15157.31,
          "description": "min=15157.31, mean=15157.31, max=15157.31, sum=15157.31 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        },
        {
          "value": 1.062,
          "description": "min=1.062, mean=1.062, max=1.062, sum=1.062 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=deepseek-ai_deepseek-r1,model_deployment=stanfordhealthcare_deepseek-r1"
          ]
        }
      ],
      [
        {
          "value": "Gemini 1.5 Pro (001)",
          "description": "",
          "markdown": false
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 586.742,
          "description": "min=586.742, mean=586.742, max=586.742, sum=586.742 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 78.61538461538461,
          "description": "min=28, mean=78.615, max=100, sum=1022 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 500.5226477055633,
          "description": "min=457.902, mean=500.523, max=535.478, sum=6506.794 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 427.0,
          "description": "min=427, mean=427, max=427, sum=427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 759.2295081967213,
          "description": "min=759.23, mean=759.23, max=759.23, sum=759.23 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 597.0,
          "description": "min=597, mean=597, max=597, sum=597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 327.09212730318256,
          "description": "min=327.092, mean=327.092, max=327.092, sum=327.092 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=3000 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 43286.37100000001,
          "description": "min=38964.654, mean=43286.371, max=46266.454, sum=129859.113 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 153.899,
          "description": "min=153.899, mean=153.899, max=153.899, sum=153.899 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 308.0,
          "description": "min=308, mean=308, max=308, sum=308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 347.45454545454544,
          "description": "min=347.455, mean=347.455, max=347.455, sum=347.455 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 149.0,
          "description": "min=149, mean=149, max=149, sum=149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 60152.4899328859,
          "description": "min=60152.49, mean=60152.49, max=60152.49, sum=60152.49 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 423.0,
          "description": "min=423, mean=423, max=423, sum=423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 277.21513002364065,
          "description": "min=277.215, mean=277.215, max=277.215, sum=277.215 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 915.0,
          "description": "min=915, mean=915, max=915, sum=915 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 740.8524590163935,
          "description": "min=740.852, mean=740.852, max=740.852, sum=740.852 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3593.987,
          "description": "min=3593.987, mean=3593.987, max=3593.987, sum=3593.987 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 120.0,
          "description": "min=120, mean=120, max=120, sum=120 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1677.5333333333333,
          "description": "min=1677.533, mean=1677.533, max=1677.533, sum=1677.533 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 128.0,
          "description": "min=128, mean=128, max=128, sum=128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 289.015625,
          "description": "min=289.016, mean=289.016, max=289.016, sum=289.016 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 197.6,
          "description": "min=197.6, mean=197.6, max=197.6, sum=197.6 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 615.41,
          "description": "min=615.41, mean=615.41, max=615.41, sum=615.41 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 487.0,
          "description": "min=487, mean=487, max=487, sum=487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 736.747433264887,
          "description": "min=736.747, mean=736.747, max=736.747, sum=736.747 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 689.0,
          "description": "min=689, mean=689, max=689, sum=689 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 23.423802612481857,
          "description": "min=23.424, mean=23.424, max=23.424, sum=23.424 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 361.0,
          "description": "min=361, mean=361, max=361, sum=361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2606.409972299169,
          "description": "min=2606.41, mean=2606.41, max=2606.41, sum=2606.41 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=2000 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "med_dialog,subset=icliniq:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "med_dialog,subset=icliniq:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "med_dialog,subset=icliniq:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 261.433,
          "description": "min=238.927, mean=261.433, max=283.939, sum=522.866 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "med_dialog,subset=icliniq:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "med_dialog,subset=icliniq:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2087.742,
          "description": "min=2087.742, mean=2087.742, max=2087.742, sum=2087.742 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 150.0,
          "description": "min=150, mean=150, max=150, sum=150 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 38.906666666666666,
          "description": "min=38.907, mean=38.907, max=38.907, sum=38.907 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 67.0,
          "description": "min=67, mean=67, max=67, sum=67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 703.044776119403,
          "description": "min=703.045, mean=703.045, max=703.045, sum=703.045 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 187.90666666666667,
          "description": "min=187.907, mean=187.907, max=187.907, sum=187.907 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 265.4066666666667,
          "description": "min=265.407, mean=265.407, max=265.407, sum=265.407 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 383.966,
          "description": "min=383.966, mean=383.966, max=383.966, sum=383.966 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1112.729,
          "description": "min=1112.729, mean=1112.729, max=1112.729, sum=1112.729 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 220.0,
          "description": "min=220, mean=220, max=220, sum=220 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2155.181818181818,
          "description": "min=2155.182, mean=2155.182, max=2155.182, sum=2155.182 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 167.0,
          "description": "min=167, mean=167, max=167, sum=167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 490.7544910179641,
          "description": "min=490.754, mean=490.754, max=490.754, sum=490.754 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 86.0,
          "description": "min=86, mean=86, max=86, sum=258 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 5833.290697674419,
          "description": "min=5783.291, mean=5833.291, max=5908.291, sum=17499.872 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 718.045,
          "description": "min=718.045, mean=718.045, max=718.045, sum=718.045 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 23792.148,
          "description": "min=23792.148, mean=23792.148, max=23792.148, sum=23792.148 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 327.0,
          "description": "min=327, mean=327, max=327, sum=327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 41037.605504587154,
          "description": "min=41037.606, mean=41037.606, max=41037.606, sum=41037.606 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 65242.146,
          "description": "min=65242.146, mean=65242.146, max=65242.146, sum=65242.146 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 15723.329,
          "description": "min=15723.329, mean=15723.329, max=15723.329, sum=15723.329 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-1.5-pro-001,model_deployment=stanfordhealthcare_gemini-1.5-pro-001"
          ]
        }
      ],
      [
        {
          "value": "Gemini 2.0 Flash",
          "description": "",
          "markdown": false
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 586.742,
          "description": "min=586.742, mean=586.742, max=586.742, sum=586.742 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 78.61538461538461,
          "description": "min=28, mean=78.615, max=100, sum=1022 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 500.5226477055633,
          "description": "min=457.902, mean=500.523, max=535.478, sum=6506.794 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=chronic_pain,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=homelessness,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=liver_disease,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=major_depression,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=personality_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "clear:condition=unemployment,max_eval_instances=100,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 427.0,
          "description": "min=427, mean=427, max=427, sum=427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 759.2295081967213,
          "description": "min=759.23, mean=759.23, max=759.23, sum=759.23 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 597.0,
          "description": "min=597, mean=597, max=597, sum=597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 327.09212730318256,
          "description": "min=327.092, mean=327.092, max=327.092, sum=327.092 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=3000 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 43286.37100000001,
          "description": "min=38964.654, mean=43286.371, max=46266.454, sum=129859.113 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=lab_anemia,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "ehrshot:subject=new_hypertension,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 153.899,
          "description": "min=153.899, mean=153.899, max=153.899, sum=153.899 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 308.0,
          "description": "min=308, mean=308, max=308, sum=308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 347.45454545454544,
          "description": "min=347.455, mean=347.455, max=347.455, sum=347.455 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 149.0,
          "description": "min=149, mean=149, max=149, sum=149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 60152.4899328859,
          "description": "min=60152.49, mean=60152.49, max=60152.49, sum=60152.49 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 423.0,
          "description": "min=423, mean=423, max=423, sum=423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 278.21513002364065,
          "description": "min=278.215, mean=278.215, max=278.215, sum=278.215 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 915.0,
          "description": "min=915, mean=915, max=915, sum=915 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 741.8524590163935,
          "description": "min=741.852, mean=741.852, max=741.852, sum=741.852 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 3593.987,
          "description": "min=3593.987, mean=3593.987, max=3593.987, sum=3593.987 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 120.0,
          "description": "min=120, mean=120, max=120, sum=120 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1677.5333333333333,
          "description": "min=1677.533, mean=1677.533, max=1677.533, sum=1677.533 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 128.0,
          "description": "min=128, mean=128, max=128, sum=128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 289.015625,
          "description": "min=289.016, mean=289.016, max=289.016, sum=289.016 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 197.6,
          "description": "min=197.6, mean=197.6, max=197.6, sum=197.6 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 615.41,
          "description": "min=615.41, mean=615.41, max=615.41, sum=615.41 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 487.0,
          "description": "min=487, mean=487, max=487, sum=487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 736.747433264887,
          "description": "min=736.747, mean=736.747, max=736.747, sum=736.747 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 689.0,
          "description": "min=689, mean=689, max=689, sum=689 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 23.423802612481857,
          "description": "min=23.424, mean=23.424, max=23.424, sum=23.424 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 361.0,
          "description": "min=361, mean=361, max=361, sum=361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2606.409972299169,
          "description": "min=2606.41, mean=2606.41, max=2606.41, sum=2606.41 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=2000 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "med_dialog,subset=icliniq:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "med_dialog,subset=icliniq:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "med_dialog,subset=icliniq:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 261.433,
          "description": "min=238.927, mean=261.433, max=283.939, sum=522.866 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "med_dialog,subset=icliniq:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "med_dialog,subset=icliniq:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2089.742,
          "description": "min=2089.742, mean=2089.742, max=2089.742, sum=2089.742 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 150.0,
          "description": "min=150, mean=150, max=150, sum=150 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 38.906666666666666,
          "description": "min=38.907, mean=38.907, max=38.907, sum=38.907 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 67.0,
          "description": "min=67, mean=67, max=67, sum=67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 703.044776119403,
          "description": "min=703.045, mean=703.045, max=703.045, sum=703.045 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 187.90666666666667,
          "description": "min=187.907, mean=187.907, max=187.907, sum=187.907 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 265.4066666666667,
          "description": "min=265.407, mean=265.407, max=265.407, sum=265.407 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 383.966,
          "description": "min=383.966, mean=383.966, max=383.966, sum=383.966 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1112.729,
          "description": "min=1112.729, mean=1112.729, max=1112.729, sum=1112.729 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 220.0,
          "description": "min=220, mean=220, max=220, sum=220 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 2157.181818181818,
          "description": "min=2157.182, mean=2157.182, max=2157.182, sum=2157.182 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 167.0,
          "description": "min=167, mean=167, max=167, sum=167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 490.7544910179641,
          "description": "min=490.754, mean=490.754, max=490.754, sum=490.754 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 86.0,
          "description": "min=86, mean=86, max=86, sum=258 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 5833.290697674419,
          "description": "min=5783.291, mean=5833.291, max=5908.291, sum=17499.872 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001",
            "n2c2_ct_matching:subject=CREATININE,model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 718.045,
          "description": "min=718.045, mean=718.045, max=718.045, sum=718.045 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 23794.148,
          "description": "min=23794.148, mean=23794.148, max=23794.148, sum=23794.148 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 327.0,
          "description": "min=327, mean=327, max=327, sum=327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 41039.605504587154,
          "description": "min=41039.606, mean=41039.606, max=41039.606, sum=41039.606 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 65244.146,
          "description": "min=65244.146, mean=65244.146, max=65244.146, sum=65244.146 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 15725.329,
          "description": "min=15725.329, mean=15725.329, max=15725.329, sum=15725.329 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=google_gemini-2.0-flash-001,model_deployment=stanfordhealthcare_gemini-2.0-flash-001"
          ]
        }
      ],
      [
        {
          "value": "GPT-4o (2024-05-13)",
          "description": "",
          "markdown": false
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 570.684,
          "description": "min=570.684, mean=570.684, max=570.684, sum=570.684 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.904,
          "description": "min=1.904, mean=1.904, max=1.904, sum=1.904 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 78.61538461538461,
          "description": "min=28, mean=78.615, max=100, sum=1022 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 483.5058692520385,
          "description": "min=443.989, mean=483.506, max=519.567, sum=6285.576 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.9984615384615385,
          "description": "min=0.99, mean=0.998, max=1, sum=12.98 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 427.0,
          "description": "min=427, mean=427, max=427, sum=427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 771.903981264637,
          "description": "min=771.904, mean=771.904, max=771.904, sum=771.904 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 450.1217798594848,
          "description": "min=450.122, mean=450.122, max=450.122, sum=450.122 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 597.0,
          "description": "min=597, mean=597, max=597, sum=597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 308.2579564489112,
          "description": "min=308.258, mean=308.258, max=308.258, sum=308.258 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 19.77721943048576,
          "description": "min=19.777, mean=19.777, max=19.777, sum=19.777 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=3000 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 35676.975666666665,
          "description": "min=32182.049, mean=35676.976, max=38197.948, sum=107030.927 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=3 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 149.213,
          "description": "min=149.213, mean=149.213, max=149.213, sum=149.213 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 308.0,
          "description": "min=308, mean=308, max=308, sum=308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 332.262987012987,
          "description": "min=332.263, mean=332.263, max=332.263, sum=332.263 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 149.0,
          "description": "min=149, mean=149, max=149, sum=149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 55486.93288590604,
          "description": "min=55486.933, mean=55486.933, max=55486.933, sum=55486.933 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 154.68456375838926,
          "description": "min=154.685, mean=154.685, max=154.685, sum=154.685 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 423.0,
          "description": "min=423, mean=423, max=423, sum=423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 280.2434988179669,
          "description": "min=280.243, mean=280.243, max=280.243, sum=280.243 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 915.0,
          "description": "min=915, mean=915, max=915, sum=915 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 735.6131147540983,
          "description": "min=735.613, mean=735.613, max=735.613, sum=735.613 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 3390.381,
          "description": "min=3390.381, mean=3390.381, max=3390.381, sum=3390.381 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 299.734,
          "description": "min=299.734, mean=299.734, max=299.734, sum=299.734 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 120.0,
          "description": "min=120, mean=120, max=120, sum=120 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1573.6416666666667,
          "description": "min=1573.642, mean=1573.642, max=1573.642, sum=1573.642 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 434.18333333333334,
          "description": "min=434.183, mean=434.183, max=434.183, sum=434.183 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 128.0,
          "description": "min=128, mean=128, max=128, sum=128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 297.8984375,
          "description": "min=297.898, mean=297.898, max=297.898, sum=297.898 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 506.8984375,
          "description": "min=506.898, mean=506.898, max=506.898, sum=506.898 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 207.052,
          "description": "min=207.052, mean=207.052, max=207.052, sum=207.052 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 91.136,
          "description": "min=91.136, mean=91.136, max=91.136, sum=91.136 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 624.83,
          "description": "min=624.83, mean=624.83, max=624.83, sum=624.83 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 217.87,
          "description": "min=217.87, mean=217.87, max=217.87, sum=217.87 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 487.0,
          "description": "min=487, mean=487, max=487, sum=487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 707.9425051334703,
          "description": "min=707.943, mean=707.943, max=707.943, sum=707.943 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 211.07186858316223,
          "description": "min=211.072, mean=211.072, max=211.072, sum=211.072 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 689.0,
          "description": "min=689, mean=689, max=689, sum=689 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 22.355587808417997,
          "description": "min=22.356, mean=22.356, max=22.356, sum=22.356 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 202.63570391872278,
          "description": "min=202.636, mean=202.636, max=202.636, sum=202.636 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 361.0,
          "description": "min=361, mean=361, max=361, sum=361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 2567.2714681440443,
          "description": "min=2567.271, mean=2567.271, max=2567.271, sum=2567.271 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 256.0,
          "description": "min=256, mean=256, max=256, sum=256 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=2000 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 258.995,
          "description": "min=236.85, mean=258.995, max=281.14, sum=517.99 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 48.3285,
          "description": "min=47.717, mean=48.328, max=48.94, sum=96.657 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 2042.08,
          "description": "min=2042.08, mean=2042.08, max=2042.08, sum=2042.08 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 150.0,
          "description": "min=150, mean=150, max=150, sum=150 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 37.626666666666665,
          "description": "min=37.627, mean=37.627, max=37.627, sum=37.627 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 369.38,
          "description": "min=369.38, mean=369.38, max=369.38, sum=369.38 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 67.0,
          "description": "min=67, mean=67, max=67, sum=67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 669.8059701492538,
          "description": "min=669.806, mean=669.806, max=669.806, sum=669.806 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 94.14925373134328,
          "description": "min=94.149, mean=94.149, max=94.149, sum=94.149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 179.92,
          "description": "min=179.92, mean=179.92, max=179.92, sum=179.92 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 254.18666666666667,
          "description": "min=254.187, mean=254.187, max=254.187, sum=254.187 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 371.822,
          "description": "min=371.822, mean=371.822, max=371.822, sum=371.822 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 978.287,
          "description": "min=978.287, mean=978.287, max=978.287, sum=978.287 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 48.703,
          "description": "min=48.703, mean=48.703, max=48.703, sum=48.703 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 220.0,
          "description": "min=220, mean=220, max=220, sum=220 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 2071.9545454545455,
          "description": "min=2071.955, mean=2071.955, max=2071.955, sum=2071.955 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 167.0,
          "description": "min=167, mean=167, max=167, sum=167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 468.88023952095807,
          "description": "min=468.88, mean=468.88, max=468.88, sum=468.88 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 86.0,
          "description": "min=86, mean=86, max=86, sum=258 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 5477.837209302325,
          "description": "min=5428.837, mean=5477.837, max=5553.837, sum=16433.512 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=3 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 701.975,
          "description": "min=701.975, mean=701.975, max=701.975, sum=701.975 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 22903.633,
          "description": "min=22903.633, mean=22903.633, max=22903.633, sum=22903.633 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 327.0,
          "description": "min=327, mean=327, max=327, sum=327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 39603.12844036697,
          "description": "min=39603.128, mean=39603.128, max=39603.128, sum=39603.128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 62667.633,
          "description": "min=62667.633, mean=62667.633, max=62667.633, sum=62667.633 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 15195.841,
          "description": "min=15195.841, mean=15195.841, max=15195.841, sum=15195.841 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-2024-05-13,model_deployment=stanfordhealthcare_gpt-4o-2024-05-13"
          ]
        }
      ],
      [
        {
          "value": "GPT-4o mini (2024-07-18)",
          "description": "",
          "markdown": false
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 570.684,
          "description": "min=570.684, mean=570.684, max=570.684, sum=570.684 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 2.071,
          "description": "min=2.071, mean=2.071, max=2.071, sum=2.071 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 78.61538461538461,
          "description": "min=28, mean=78.615, max=100, sum=1022 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 483.5058692520385,
          "description": "min=443.989, mean=483.506, max=519.567, sum=6285.576 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.9984615384615385,
          "description": "min=0.99, mean=0.998, max=1, sum=12.98 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=chronic_pain,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=homelessness,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=liver_disease,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=major_depression,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=personality_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "clear:condition=unemployment,max_eval_instances=100,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 427.0,
          "description": "min=427, mean=427, max=427, sum=427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 771.903981264637,
          "description": "min=771.904, mean=771.904, max=771.904, sum=771.904 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 504.9695550351288,
          "description": "min=504.97, mean=504.97, max=504.97, sum=504.97 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 597.0,
          "description": "min=597, mean=597, max=597, sum=597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 308.2579564489112,
          "description": "min=308.258, mean=308.258, max=308.258, sum=308.258 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 25.284757118927974,
          "description": "min=25.285, mean=25.285, max=25.285, sum=25.285 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=3000 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 35676.975666666665,
          "description": "min=32182.049, mean=35676.976, max=38197.948, sum=107030.927 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=3 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=lab_anemia,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "ehrshot:subject=new_hypertension,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 149.213,
          "description": "min=149.213, mean=149.213, max=149.213, sum=149.213 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 308.0,
          "description": "min=308, mean=308, max=308, sum=308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 332.262987012987,
          "description": "min=332.263, mean=332.263, max=332.263, sum=332.263 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 149.0,
          "description": "min=149, mean=149, max=149, sum=149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 55486.93288590604,
          "description": "min=55486.933, mean=55486.933, max=55486.933, sum=55486.933 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 160.46979865771812,
          "description": "min=160.47, mean=160.47, max=160.47, sum=160.47 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 423.0,
          "description": "min=423, mean=423, max=423, sum=423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 280.2434988179669,
          "description": "min=280.243, mean=280.243, max=280.243, sum=280.243 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 915.0,
          "description": "min=915, mean=915, max=915, sum=915 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 735.6131147540983,
          "description": "min=735.613, mean=735.613, max=735.613, sum=735.613 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 3390.381,
          "description": "min=3390.381, mean=3390.381, max=3390.381, sum=3390.381 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 295.109,
          "description": "min=295.109, mean=295.109, max=295.109, sum=295.109 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 120.0,
          "description": "min=120, mean=120, max=120, sum=120 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1573.6416666666667,
          "description": "min=1573.642, mean=1573.642, max=1573.642, sum=1573.642 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 427.8333333333333,
          "description": "min=427.833, mean=427.833, max=427.833, sum=427.833 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 128.0,
          "description": "min=128, mean=128, max=128, sum=128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 297.8984375,
          "description": "min=297.898, mean=297.898, max=297.898, sum=297.898 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 505.53125,
          "description": "min=505.531, mean=505.531, max=505.531, sum=505.531 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 207.052,
          "description": "min=207.052, mean=207.052, max=207.052, sum=207.052 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 90.395,
          "description": "min=90.395, mean=90.395, max=90.395, sum=90.395 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 624.83,
          "description": "min=624.83, mean=624.83, max=624.83, sum=624.83 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 139.81,
          "description": "min=139.81, mean=139.81, max=139.81, sum=139.81 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 487.0,
          "description": "min=487, mean=487, max=487, sum=487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 707.9425051334703,
          "description": "min=707.943, mean=707.943, max=707.943, sum=707.943 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 206.6447638603696,
          "description": "min=206.645, mean=206.645, max=206.645, sum=206.645 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 689.0,
          "description": "min=689, mean=689, max=689, sum=689 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 22.355587808417997,
          "description": "min=22.356, mean=22.356, max=22.356, sum=22.356 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 184.02031930333817,
          "description": "min=184.02, mean=184.02, max=184.02, sum=184.02 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 361.0,
          "description": "min=361, mean=361, max=361, sum=361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 2567.2714681440443,
          "description": "min=2567.271, mean=2567.271, max=2567.271, sum=2567.271 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 256.0,
          "description": "min=256, mean=256, max=256, sum=256 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=2000 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 258.995,
          "description": "min=236.85, mean=258.995, max=281.14, sum=517.99 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 44.437,
          "description": "min=43.741, mean=44.437, max=45.133, sum=88.874 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "med_dialog,subset=icliniq:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 2042.08,
          "description": "min=2042.08, mean=2042.08, max=2042.08, sum=2042.08 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 150.0,
          "description": "min=150, mean=150, max=150, sum=150 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 37.626666666666665,
          "description": "min=37.627, mean=37.627, max=37.627, sum=37.627 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 339.56666666666666,
          "description": "min=339.567, mean=339.567, max=339.567, sum=339.567 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 67.0,
          "description": "min=67, mean=67, max=67, sum=67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 669.8059701492538,
          "description": "min=669.806, mean=669.806, max=669.806, sum=669.806 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 179.92,
          "description": "min=179.92, mean=179.92, max=179.92, sum=179.92 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 254.18666666666667,
          "description": "min=254.187, mean=254.187, max=254.187, sum=254.187 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 371.822,
          "description": "min=371.822, mean=371.822, max=371.822, sum=371.822 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 978.287,
          "description": "min=978.287, mean=978.287, max=978.287, sum=978.287 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 66.542,
          "description": "min=66.542, mean=66.542, max=66.542, sum=66.542 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 220.0,
          "description": "min=220, mean=220, max=220, sum=220 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 2071.9545454545455,
          "description": "min=2071.955, mean=2071.955, max=2071.955, sum=2071.955 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 167.0,
          "description": "min=167, mean=167, max=167, sum=167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 468.88023952095807,
          "description": "min=468.88, mean=468.88, max=468.88, sum=468.88 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 86.0,
          "description": "min=86, mean=86, max=86, sum=258 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 5477.837209302325,
          "description": "min=5428.837, mean=5477.837, max=5553.837, sum=16433.512 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=3 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18",
            "n2c2_ct_matching:subject=CREATININE,model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 701.975,
          "description": "min=701.975, mean=701.975, max=701.975, sum=701.975 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 22903.633,
          "description": "min=22903.633, mean=22903.633, max=22903.633, sum=22903.633 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 327.0,
          "description": "min=327, mean=327, max=327, sum=327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 39603.12844036697,
          "description": "min=39603.128, mean=39603.128, max=39603.128, sum=39603.128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 62667.633,
          "description": "min=62667.633, mean=62667.633, max=62667.633, sum=62667.633 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 15195.841,
          "description": "min=15195.841, mean=15195.841, max=15195.841, sum=15195.841 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=openai_gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare_gpt-4o-mini-2024-07-18"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.3 Instruct (70B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 584.571,
          "description": "min=584.571, mean=584.571, max=584.571, sum=584.571 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 4.27,
          "description": "min=4.27, mean=4.27, max=4.27, sum=4.27 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 78.61538461538461,
          "description": "min=28, mean=78.615, max=100, sum=1022 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=chronic_pain,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=homelessness,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=liver_disease,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=major_depression,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=personality_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=unemployment,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=chronic_pain,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=homelessness,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=liver_disease,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=major_depression,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=personality_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=unemployment,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=chronic_pain,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=homelessness,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=liver_disease,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=major_depression,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=personality_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=unemployment,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 489.01692165679975,
          "description": "min=447.424, mean=489.017, max=525.955, sum=6357.22 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=chronic_pain,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=homelessness,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=liver_disease,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=major_depression,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=personality_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=unemployment,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.9984615384615385,
          "description": "min=0.99, mean=0.998, max=1, sum=12.98 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=bipolar_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=chronic_pain,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=homelessness,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=liver_disease,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=major_depression,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=personality_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=substance_use_disorder,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=suicidal_behavior,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=tobacco_dependence,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "clear:condition=unemployment,max_eval_instances=100,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 427.0,
          "description": "min=427, mean=427, max=427, sum=427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 788.0538641686182,
          "description": "min=788.054, mean=788.054, max=788.054, sum=788.054 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 443.0234192037471,
          "description": "min=443.023, mean=443.023, max=443.023, sum=443.023 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 597.0,
          "description": "min=597, mean=597, max=597, sum=597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 311.5812395309883,
          "description": "min=311.581, mean=311.581, max=311.581, sum=311.581 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 171.64489112227807,
          "description": "min=171.645, mean=171.645, max=171.645, sum=171.645 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=3000 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=lab_anemia,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=new_hypertension,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=lab_anemia,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=new_hypertension,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=lab_anemia,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=new_hypertension,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 32868.43833333333,
          "description": "min=29647.98, mean=32868.438, max=35141.782, sum=98605.315 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=lab_anemia,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=new_hypertension,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=3 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=lab_anemia,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "ehrshot:subject=new_hypertension,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 150.567,
          "description": "min=150.567, mean=150.567, max=150.567, sum=150.567 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 308.0,
          "description": "min=308, mean=308, max=308, sum=308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 337.39935064935065,
          "description": "min=337.399, mean=337.399, max=337.399, sum=337.399 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 149.0,
          "description": "min=149, mean=149, max=149, sum=149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 55932.718120805366,
          "description": "min=55932.718, mean=55932.718, max=55932.718, sum=55932.718 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 138.00671140939596,
          "description": "min=138.007, mean=138.007, max=138.007, sum=138.007 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 423.0,
          "description": "min=423, mean=423, max=423, sum=423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 283.0118203309693,
          "description": "min=283.012, mean=283.012, max=283.012, sum=283.012 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 915.0,
          "description": "min=915, mean=915, max=915, sum=915 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 746.248087431694,
          "description": "min=746.248, mean=746.248, max=746.248, sum=746.248 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 3471.207,
          "description": "min=3471.207, mean=3471.207, max=3471.207, sum=3471.207 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 295.329,
          "description": "min=295.329, mean=295.329, max=295.329, sum=295.329 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 120.0,
          "description": "min=120, mean=120, max=120, sum=120 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1629.5833333333333,
          "description": "min=1629.583, mean=1629.583, max=1629.583, sum=1629.583 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 428.6666666666667,
          "description": "min=428.667, mean=428.667, max=428.667, sum=428.667 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 128.0,
          "description": "min=128, mean=128, max=128, sum=128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 304.1640625,
          "description": "min=304.164, mean=304.164, max=304.164, sum=304.164 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 456.828125,
          "description": "min=456.828, mean=456.828, max=456.828, sum=456.828 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 210.318,
          "description": "min=210.318, mean=210.318, max=210.318, sum=210.318 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 54.236,
          "description": "min=54.236, mean=54.236, max=54.236, sum=54.236 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 643.16,
          "description": "min=643.16, mean=643.16, max=643.16, sum=643.16 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 118.91,
          "description": "min=118.91, mean=118.91, max=118.91, sum=118.91 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 487.0,
          "description": "min=487, mean=487, max=487, sum=487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 723.6078028747434,
          "description": "min=723.608, mean=723.608, max=723.608, sum=723.608 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 167.82340862422998,
          "description": "min=167.823, mean=167.823, max=167.823, sum=167.823 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 689.0,
          "description": "min=689, mean=689, max=689, sum=689 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 22.6966618287373,
          "description": "min=22.697, mean=22.697, max=22.697, sum=22.697 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 295.51959361393324,
          "description": "min=295.52, mean=295.52, max=295.52, sum=295.52 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 361.0,
          "description": "min=361, mean=361, max=361, sum=361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 2616.9639889196674,
          "description": "min=2616.964, mean=2616.964, max=2616.964, sum=2616.964 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 255.98060941828254,
          "description": "min=255.981, mean=255.981, max=255.981, sum=255.981 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=2000 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "med_dialog,subset=icliniq:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "med_dialog,subset=icliniq:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "med_dialog,subset=icliniq:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 261.095,
          "description": "min=239.772, mean=261.095, max=282.418, sum=522.19 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "med_dialog,subset=icliniq:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 51.19,
          "description": "min=50.736, mean=51.19, max=51.644, sum=102.38 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "med_dialog,subset=icliniq:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 2070.752,
          "description": "min=2070.752, mean=2070.752, max=2070.752, sum=2070.752 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 150.0,
          "description": "min=150, mean=150, max=150, sum=150 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 38.16,
          "description": "min=38.16, mean=38.16, max=38.16, sum=38.16 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 486.67333333333335,
          "description": "min=486.673, mean=486.673, max=486.673, sum=486.673 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 67.0,
          "description": "min=67, mean=67, max=67, sum=67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 685.4179104477612,
          "description": "min=685.418, mean=685.418, max=685.418, sum=685.418 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 100.04477611940298,
          "description": "min=100.045, mean=100.045, max=100.045, sum=100.045 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 183.76,
          "description": "min=183.76, mean=183.76, max=183.76, sum=183.76 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 257.2866666666667,
          "description": "min=257.287, mean=257.287, max=257.287, sum=257.287 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 376.774,
          "description": "min=376.774, mean=376.774, max=376.774, sum=376.774 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.102,
          "description": "min=1.102, mean=1.102, max=1.102, sum=1.102 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 962.504,
          "description": "min=962.504, mean=962.504, max=962.504, sum=962.504 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 61.127,
          "description": "min=61.127, mean=61.127, max=61.127, sum=61.127 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 220.0,
          "description": "min=220, mean=220, max=220, sum=220 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 2091.018181818182,
          "description": "min=2091.018, mean=2091.018, max=2091.018, sum=2091.018 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 167.0,
          "description": "min=167, mean=167, max=167, sum=167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 472.91616766467064,
          "description": "min=472.916, mean=472.916, max=472.916, sum=472.916 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 86.0,
          "description": "min=86, mean=86, max=86, sum=258 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=CREATININE,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=CREATININE,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=CREATININE,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 5588.84496124031,
          "description": "min=5538.512, mean=5588.845, max=5665.512, sum=16766.535 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=CREATININE,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=3 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=ADVANCED-CAD,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct",
            "n2c2_ct_matching:subject=CREATININE,model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 708.639,
          "description": "min=708.639, mean=708.639, max=708.639, sum=708.639 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 23367.678,
          "description": "min=23367.678, mean=23367.678, max=23367.678, sum=23367.678 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 327.0,
          "description": "min=327, mean=327, max=327, sum=327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 40350.51376146789,
          "description": "min=40350.514, mean=40350.514, max=40350.514, sum=40350.514 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 63948.647,
          "description": "min=63948.647, mean=63948.647, max=63948.647, sum=63948.647 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 15443.736,
          "description": "min=15443.736, mean=15443.736, max=15443.736, sum=15443.736 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:model=meta_llama-3.3-70b-instruct,model_deployment=stanfordhealthcare_llama-3.3-70b-instruct"
          ]
        }
      ],
      [
        {
          "value": "o3-mini (2025-01-31)",
          "description": "",
          "markdown": false
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 585.063,
          "description": "min=585.063, mean=585.063, max=585.063, sum=585.063 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 2.471,
          "description": "min=2.471, mean=2.471, max=2.471, sum=2.471 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medcalc_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 78.61538461538461,
          "description": "min=28, mean=78.615, max=100, sum=1022 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 489.1569475450976,
          "description": "min=447.511, mean=489.157, max=526.164, sum=6359.04 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.9984615384615385,
          "description": "min=0.99, mean=0.998, max=1, sum=12.98 (13)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "clear:condition=alcohol_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=bipolar_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=chronic_pain,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=homelessness,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=liver_disease,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=major_depression,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=personality_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=substance_use_disorder,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=suicidal_behavior,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=tobacco_dependence,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "clear:condition=unemployment,max_eval_instances=100,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 427.0,
          "description": "min=427, mean=427, max=427, sum=427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 789.3325526932084,
          "description": "min=789.333, mean=789.333, max=789.333, sum=789.333 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 631.599531615925,
          "description": "min=631.6, mean=631.6, max=631.6, sum=631.6 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_replicate:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 597.0,
          "description": "min=597, mean=597, max=597, sum=597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 311.64489112227807,
          "description": "min=311.645, mean=311.645, max=311.645, sum=311.645 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 22.768844221105528,
          "description": "min=22.769, mean=22.769, max=22.769, sum=22.769 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medec:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=3000 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 32868.44466666667,
          "description": "min=29647.986, mean=32868.445, max=35141.789, sum=98605.334 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.9993333333333333,
          "description": "min=0.997, mean=0.999, max=1.001, sum=2.998 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehrshot:subject=guo_readmission,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=lab_anemia,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "ehrshot:subject=new_hypertension,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 150.624,
          "description": "min=150.624, mean=150.624, max=150.624, sum=150.624 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.988,
          "description": "min=0.988, mean=0.988, max=0.988, sum=0.988 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "head_qa:language=en,category=None,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 308.0,
          "description": "min=308, mean=308, max=308, sum=308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 337.487012987013,
          "description": "min=337.487, mean=337.487, max=337.487, sum=337.487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.9902597402597403,
          "description": "min=0.99, mean=0.99, max=0.99, sum=0.99 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medbullets:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 149.0,
          "description": "min=149, mean=149, max=149, sum=149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 55993.476510067114,
          "description": "min=55993.477, mean=55993.477, max=55993.477, sum=55993.477 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 185.73825503355704,
          "description": "min=185.738, mean=185.738, max=185.738, sum=185.738 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medalign:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 423.0,
          "description": "min=423, mean=423, max=423, sum=423 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 284.16784869976357,
          "description": "min=284.168, mean=284.168, max=284.168, sum=284.168 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ptbm_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 915.0,
          "description": "min=915, mean=915, max=915, sum=915 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 754.4939890710383,
          "description": "min=754.494, mean=754.494, max=754.494, sum=754.494 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sei_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 3474.609,
          "description": "min=3474.609, mean=3474.609, max=3474.609, sum=3474.609 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 548.381,
          "description": "min=548.381, mean=548.381, max=548.381, sum=548.381 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "dischargeme:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 120.0,
          "description": "min=120, mean=120, max=120, sum=120 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1629.6166666666666,
          "description": "min=1629.617, mean=1629.617, max=1629.617, sum=1629.617 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 491.2583333333333,
          "description": "min=491.258, mean=491.258, max=491.258, sum=491.258 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "aci_bench:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 128.0,
          "description": "min=128, mean=128, max=128, sum=128 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 304.84375,
          "description": "min=304.844, mean=304.844, max=304.844, sum=304.844 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 684.5703125,
          "description": "min=684.57, mean=684.57, max=684.57, sum=684.57 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mtsamples_procedures:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 210.371,
          "description": "min=210.371, mean=210.371, max=210.371, sum=210.371 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 87.559,
          "description": "min=87.559, mean=87.559, max=87.559, sum=87.559 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_rrs:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=100 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 644.34,
          "description": "min=644.34, mean=644.34, max=644.34, sum=644.34 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 158.65,
          "description": "min=158.65, mean=158.65, max=158.65, sum=158.65 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimic_bhc:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 487.0,
          "description": "min=487, mean=487, max=487, sum=487 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 723.8459958932239,
          "description": "min=723.846, mean=723.846, max=723.846, sum=723.846 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 145.43326488706364,
          "description": "min=145.433, mean=145.433, max=145.433, sum=145.433 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "chw_care_plan:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 689.0,
          "description": "min=689, mean=689, max=689, sum=689 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 22.712626995645863,
          "description": "min=22.713, mean=22.713, max=22.713, sum=22.713 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 259.14513788098697,
          "description": "min=259.145, mean=259.145, max=259.145, sum=259.145 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medication_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 361.0,
          "description": "min=361, mean=361, max=361, sum=361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 2622.0193905817173,
          "description": "min=2622.019, mean=2622.019, max=2622.019, sum=2622.019 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 737.5761772853185,
          "description": "min=737.576, mean=737.576, max=737.576, sum=737.576 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "starr_patient_instructions:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=2000 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 261.2355,
          "description": "min=239.917, mean=261.236, max=282.554, sum=522.471 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 54.4685,
          "description": "min=54.326, mean=54.468, max=54.611, sum=108.937 (2)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_dialog,subset=healthcaremagic:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "med_dialog,subset=icliniq:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 2075.158,
          "description": "min=2075.158, mean=2075.158, max=2075.158, sum=2075.158 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_conf_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 150.0,
          "description": "min=150, mean=150, max=150, sum=150 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 38.16,
          "description": "min=38.16, mean=38.16, max=38.16, sum=38.16 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 403.76666666666665,
          "description": "min=403.767, mean=403.767, max=403.767, sum=403.767 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medi_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 67.0,
          "description": "min=67, mean=67, max=67, sum=67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 685.4179104477612,
          "description": "min=685.418, mean=685.418, max=685.418, sum=685.418 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 214.62686567164178,
          "description": "min=214.627, mean=214.627, max=214.627, sum=214.627 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mental_health:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 183.78666666666666,
          "description": "min=183.787, mean=183.787, max=183.787, sum=183.787 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_proxy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 300.0,
          "description": "min=300, mean=300, max=300, sum=300 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 257.33,
          "description": "min=257.33, mean=257.33, max=257.33, sum=257.33 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_privacy_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 377.515,
          "description": "min=377.515, mean=377.515, max=377.515, sum=377.515 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.987,
          "description": "min=0.987, mean=0.987, max=0.987, sum=0.987 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "pubmed_qa:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 963.516,
          "description": "min=963.516, mean=963.516, max=963.516, sum=963.516 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 60.344,
          "description": "min=60.344, mean=60.344, max=60.344, sum=60.344 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "ehr_sql:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 220.0,
          "description": "min=220, mean=220, max=220, sum=220 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 2095.2636363636366,
          "description": "min=2095.264, mean=2095.264, max=2095.264, sum=2095.264 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_bmt_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 167.0,
          "description": "min=167, mean=167, max=167, sum=167 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 473.0239520958084,
          "description": "min=473.024, mean=473.024, max=473.024, sum=473.024 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "race_based_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 86.0,
          "description": "min=86, mean=86, max=86, sum=258 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 5594.158914728682,
          "description": "min=5543.826, mean=5594.159, max=5670.826, sum=16782.477 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=3 (3)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "n2c2_ct_matching:subject=ABDOMINAL,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=ADVANCED-CAD,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31",
            "n2c2_ct_matching:subject=CREATININE,num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 708.757,
          "description": "min=708.757, mean=708.757, max=708.757, sum=708.757 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.993,
          "description": "min=0.993, mean=0.993, max=0.993, sum=0.993 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "medhallu:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.001,
          "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 23387.861,
          "description": "min=23387.861, mean=23387.861, max=23387.861, sum=23387.861 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.065,
          "description": "min=1.065, mean=1.065, max=1.065, sum=1.065 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_gip_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mimiciv_billing_code:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "description": "1 matching runs, but no matching metrics",
          "markdown": false
        },
        {
          "value": 327.0,
          "description": "min=327, mean=327, max=327, sum=327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.009174311926605505,
          "description": "min=0.009, mean=0.009, max=0.009, sum=0.009 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 40377.91743119266,
          "description": "min=40377.917, mean=40377.917, max=40377.917, sum=40377.917 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_sequoia_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.018,
          "description": "min=0.018, mean=0.018, max=0.018, sum=0.018 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 63970.858,
          "description": "min=63970.858, mean=63970.858, max=63970.858, sum=63970.858 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.736,
          "description": "min=1.736, mean=1.736, max=1.736, sum=1.736 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_cdi_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 15484.159,
          "description": "min=15484.159, mean=15484.159, max=15484.159, sum=15484.159 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        }
      ]
    ],
    "links": [
      {
        "text": "LaTeX",
        "href": "benchmark_output/releases/v2.0.0/groups/latex/medhelm_scenarios_general_information.tex"
      },
      {
        "text": "JSON",
        "href": "benchmark_output/releases/v2.0.0/groups/json/medhelm_scenarios_general_information.json"
      }
    ],
    "name": "general_information"
  }
]      "shc_ent_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 15484.159,
          "description": "min=15484.159, mean=15484.159, max=15484.159, sum=15484.159 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "shc_ent_med:num_output_tokens=4000,model=openai_o3-mini-2025-01-31,model_deployment=stanfordhealthcare_o3-mini-2025-01-31"
          ]
        }
      ]
    ],
    "links": [
      {
        "text": "LaTeX",
        "href": "benchmark_output/releases/v2.0.0/groups/latex/medhelm_scenarios_general_information.tex"
      },
      {
        "text": "JSON",
        "href": "benchmark_output/releases/v2.0.0/groups/json/medhelm_scenarios_general_information.json"
      }
    ],
    "name": "general_information"
  }
]

In [None]:
# Template dict for one MedHelm leaderboard record, keyed by the top-level
# fields of the evaluation schema.
#
# Only the leading entries hold concrete values so far. From
# "evaluation_platform_name" onward the values are still the schema's own
# field definitions (type / description / properties) and have not yet been
# replaced with real record values.
BASE = {
    "schema_version": "0.0.1",
    # Placeholder; presumably filled in per record (TODO confirm).
    "evaluation_id": None,
    # Placeholder: the original cell left this value empty (a SyntaxError).
    # None mirrors the "evaluation_id" placeholder above.
    "retrieved_timestamp": None,
    "source_data_urls": ["https://storage.googleapis.com/crfm-helm-public/medhelm/benchmark_output/releases/v2.0.0/groups/medhelm_scenarios.json"],
    "leaderboard_name": "MedHelm",
    "evaluation_platform_name": {
        "type": "string",
        "description": "Title of the platform used for the evaluation."
    },
    "source_metadata": {
        "type": "object",
        "description": "Metadata about the source of the leaderboard data",
        "required": [
            "source_organization_name",
            "evaluator_relationship"
        ],
        "properties": {
            "source_organization_name": {
                "type": "string",
                "description": "Name of the organization that provides the data"
            },
            "source_organization_url": {
                "type": "string",
                "description": "URL for the organization that provides the data"
            },
            "source_organization_logo_url": {
                "type": "string",
                "description": "URL for the Logo for the organization that provides the data"
            },
            "evaluator_relationship": {
                "type": "string",
                "description": "Relationship between the evaluator and the model",
                "enum": [
                    "first_party",
                    "third_party",
                    "collaborative",
                    "other"
                ]
            }
        }
    },
    "model_info": {
        "type": "object",
        "description": "Complete model specification including basic information, technical configuration and inference settings",
        "required": [
            "name"
        ],
        "properties": {
            "name": {
                "type": "string",
                "description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)"
            },
            "developer": {
                "type": "string",
                "description": "Name of organization that provides the model (e.g. 'OpenAI')"
            },
            "inference_platform": {
                "type": "string",
                "description": "Description of platform used to run the evaluations (e.g. local machine, Bedrock)"
            }
        }
    },
    "evaluation_results": {
        "type": "array",
        "description": "Array of evaluation results",
        "items": {
            "type": "object",
            "required": [
                "evaluation_name",
                "metric_config",
                "score_details"
            ],
            "properties": {
                "evaluation_name": {
                    "type": "string",
                    "description": "Name of the evaluation"
                },
                "evaluation_timestamp": {
                    "type": "string",
                    "description": "Timestamp for when the evaluations were run"
                },
                "metric_config": {
                    "type": "object",
                    "description": "Details about the metric",
                    "required": [
                        "lower_is_better"
                    ],
                    "properties": {
                        "evaluation_description": {
                            "type": "string",
                            "description": "Description of the evaluation"
                        },
                        "lower_is_better": {
                            "type": "boolean",
                            "description": "Whether a lower score is better"
                        },
                        "score_type": {
                            "type": "string",
                            "description": "Type of score",
                            "enum": [
                                "binary",
                                "continuous",
                                "levels"
                            ]
                        },
                        "level_names": {
                            "type": "array",
                            "description": "Names of the score levels",
                            "items": {
                                "type": "string"
                            }
                        },
                        "level_metadata": {
                            "type": "array",
                            "description": "Additional Description for each Score Level",
                            "items": {
                                "type": "string"
                            }
                        },
                        "has_unknown_level": {
                            "type": "boolean",
                            "description": "Indicates whether there is an Unknown Level - if True, then a score of -1 will be treated as Unknown"
                        },
                        "min_score": {
                            "type": "number",
                            "description": "Minimum possible score for continuous metric"
                        },
                        "max_score": {
                            "type": "number",
                            "description": "Maximum possible score for continuous metric"
                        }
                    },
                    "if": {
                        "properties": {
                            "score_type": {
                                "const": "levels"
                            }
                        }
                    },
                    "then": {
                        "required": [
                            "level_names",
                            "has_unknown_level"
                        ]
                    },
                    "else": {
                        "if": {
                            "properties": {
                                "score_type": {
                                    "const": "continuous"
                                }
                            }
                        },
                        "then": {
                            "required": [
                                "min_score",
                                "max_score"
                            ]
                        }
                    }
                },
                "score_details": {
                    # NOTE(review): "string" alongside "required"/"properties"
                    # looks inconsistent -- possibly meant to be "object".
                    # Left unchanged; confirm against the schema.
                    "type": "string",
                    "description": "The score for the evaluation and related details",
                    "required": [
                        "score"
                    ],
                    "properties": {
                        "score": {
                            "type": "number",
                            "description": "The score for the evaluation"
                        },
                        "details": {
                            "type": "object",
                            "description": "Any additional details about the score",
                            # Python spelling of the boolean; JSON's lowercase
                            # `true` is an undefined name in Python.
                            "additionalProperties": True
                        }
                    }
                },
                "detailed_evaluation_results_url": {
                    "type": "string",
                    "description": "Link to detailed evaluation data"
                },
                "generation_config": {
                    "type": "object",
                    # NOTE(review): "generation_args" and "additional_details"
                    # sit directly in this dict rather than under a
                    # "properties" key -- confirm this is intended.
                    "generation_args": {
                        "type": "object",
                        "description": "Parameters used to generate results - properties may vary by model type",
                        "properties": {
                            "temperature": {
                                "type": [
                                    "null",
                                    "number"
                                ],
                                "description": "Sampling temperature"
                            },
                            "top_p": {
                                "type": [
                                    "null",
                                    "number"
                                ],
                                "description": "Nucleus sampling parameter"
                            },
                            "top_k": {
                                "type": [
                                    "null",
                                    "number"
                                ],
                                "description": "Top-k sampling parameter"
                            },
                            "max_tokens": {
                                "type": "integer",
                                "minimum": 1,
                                "description": "Maximum number of tokens to generate"
                            }
                        },
                        "additionalProperties": True
                    },
                    "additional_details": {
                        "type": "string",
                        "description": "Additional details about how the results for this metric were generated."
                    }
                }
            }
        }
    }
}
