# LLM Comparator: Running Comparative Evaluations with Google Vertex AI

In [4]:
#@title Licensed under the Apache License, Version 2.0
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [5]:
#@title Install the LLM Comparator package
! pip install llm_comparator



In [7]:
#@title Import relevant packages
import vertexai
from google.colab import auth

# The comparison library provides the primary API for running Comparative
# Evaluations and generating the JSON files required by the LLM Comparator web
# app.
from llm_comparator import comparison

# The model_helper library is used to initialize the API wrappers that interface
# with models. For this demo we focus on models served by Google Vertex AI, but
# you can extend the llm_comparator.model_helper.GenerationModelHelper and
# llm_comparator.model_helper.EmbeddingModelHelper classes to work with other
# providers or models you host yourself.
from llm_comparator import model_helper

# The following libraries contain wrappers that implement the core functionality
# of the Comparative Evaluation workflow. More on these below.
from llm_comparator import llm_judge_runner
from llm_comparator import rationale_bullet_generator
from llm_comparator import rationale_cluster_generator

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
#@title Setup and authenticate with Google Vertex AI.
# Google Cloud project and region handed to vertexai.init() below. Replace the
# placeholder project ID with your own before running this cell.
PROJECT_ID = 'your_project_id'  #@param {type: "string"}
REGION = 'us-central1'  #@param {type: "string"}

# Authenticate the current Colab user. NOTE(review): presumably this supplies
# the credentials used by the gcloud and Vertex AI calls below -- confirm
# against the google.colab.auth documentation.
auth.authenticate_user()
# Shell command: make PROJECT_ID the active project in the gcloud CLI config.
! gcloud config set project {PROJECT_ID}
# Initialize the Vertex AI SDK with the same project and region.
vertexai.init(project=PROJECT_ID, location=REGION)

NameError: name 'auth' is not defined

In [None]:
#@title Prepare Your Inputs

# Every example pairs one prompt with the two responses being compared. The
# required input type is llm_comparator.llm_judge_runner.LLMJudgeInput.
llm_judge_inputs = [
    dict(prompt='how are you?', response_a='good', response_b='bad'),
    dict(prompt='hello?', response_a='hello', response_b='hi'),
    dict(
        prompt='what is the capital of korea?',
        response_a='Seoul',
        response_b='Vancouver',
    ),
]

In [None]:
#@title Initialize models used in the LLM Comparator evaluation.

# Generator: any Text-to-Text LLM provided by Vertex AI can be used. This single
# model is asked to perform a series of tasks (judge, bulletize, and cluster),
# so a larger model is often the better choice.
#
# The default is 'gemini-pro'; pass the `model_name=` param to select a
# different model. The full list of models available via the Model Garden is at
# https://console.cloud.google.com/vertex-ai/model-garden?pageState=(%22galleryStateKey%22:(%22f%22:(%22g%22:%5B%22supportedTasks%22,%22inputTypes%22%5D,%22o%22:%5B%22GENERATION%22,%22LANGUAGE%22%5D),%22s%22:%22%22))
#
# Gemini Pro is a very competent and flexible foundation model, so one generator
# is shared across all downstream tasks here. If desired, each task could be
# given its own model instead.
generator = model_helper.VertexGenerationModelHelper()

# Embedder: any text embedder provided by Vertex AI can be used. The default is
# 'textembedding-gecko@003'; pass the `model_name=` param to select a different
# model. The full list of models available via the Model Garden is at
# https://console.cloud.google.com/vertex-ai/model-garden?pageState=(%22galleryStateKey%22:(%22f%22:(%22g%22:%5B%22supportedTasks%22,%22inputTypes%22%5D,%22o%22:%5B%22EMBEDDING%22,%22LANGUAGE%22%5D),%22s%22:%22%22))
embedder = model_helper.VertexEmbeddingModelHelper()

# The three objects created below perform the core work of a Comparative
# Evaluation: judging, bulletizing, and clustering. Each class provides a
# `.run()` function, and the `llm_comparator.comparison.run()` API orchestrates
# configuring and calling them on the instances you pass in. Configuration is
# covered further below.

# Judge: the model that actually compares the two responses. The same judge is
# run multiple times to obtain a diversity of perspectives; how to configure
# this is covered further below.
#
# So that the bulletizer and clusterer can process its output, a judge must
# phrase each response in a simple XML format containing the verdict and an
# explanation of the results:
#
#     <result>
#       <explanation>YOUR EXPLANATION GOES HERE.</explanation>
#       <verdict>A is slightly better</verdict>
#     </result>
#
# A default judge prompt is provided in
# llm_comparator.llm_judge_runner.DEFAULT_LLM_JUDGE_PROMPT_TEMPLATE. Use the
# `llm_judge_prompt_template=` parameter to supply a custom prompt that may
# better suit your needs.
judge = llm_judge_runner.LLMJudgeRunner(generator)

# Bulletizer: condenses the judge's results into a set of bullets so they are
# easier to understand and consume in the UI.
bulletizer = rationale_bullet_generator.RationaleBulletGenerator(generator)

# Clusterer: embeds the bullets, groups them into clusters by embedding
# similarity, and generates a label for each cluster.
clusterer = rationale_cluster_generator.RationaleClusterGenerator(
    generator,
    embedder,
)

In [None]:
#@title Run the Comparative Evaluation.

# comparison.run() is the primary interface for running a Comparative
# Evaluation. Given your prepared inputs plus a judge, a bulletizer, and a
# clusterer, it returns a Python dictionary in the format required by the LLM
# Comparator web app. The dictionary can be inspected directly in Python, but it
# is more useful once written to a file.
#
# The call below is basic. To customize the behavior further, pass any of the
# optional judge_opts=, bulletizer_opts=, and clusterer_opts= dictionaries,
# which are converted to keyword options. See the docstrings for more.
comparison_result = comparison.run(
    llm_judge_inputs, judge, bulletizer, clusterer
)

In [None]:
#@title [Optional] Save the results to a file.
# Destination path for the results; the same variable is passed to
# comparison.show_in_colab() in the next cell.
file_path = 'json_for_llm_comparator.json' # @param {type: "string"}
# Write the dictionary returned by comparison.run() to file_path.
comparison.write(comparison_result, file_path)

In [None]:
#@title [Optional] View the results in the app in Colab.
# NOTE: `file_path` is assigned in the optional save cell above, so that cell
# must be run first; otherwise this call fails with a NameError.
comparison.show_in_colab(file_path)