# Evaluating Pre-generated Model Responses on MeetingBank Dataset

This notebook demonstrates how to evaluate pre-generated model responses (including non-Bedrock models like Gemini Flash) on the MeetingBank dataset for meeting summarization tasks.

In [1]:
# Import required libraries
%load_ext autoreload
%autoreload 2
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import boto3
from datetime import datetime

# Import utility functions
from utils.dataset_utils import load_meetingbank_dataset, get_test_samples, prepare_for_bedrock_evaluation, load_evaluation_dataset
from utils.bedrock_utils import (
    create_s3_bucket_if_not_exists,
    apply_cors_if_not_exists,
    upload_to_s3,
    create_evaluation_job,
    wait_for_job_completion,
    download_evaluation_results,
    analyze_evaluation_results,
    visualize_evaluation_results,
    generate_model_responses
)

In [None]:
# Gemini (Vertex AI) setup and smoke test.
#
# This cell authenticates (on Colab only), builds a google-genai client for
# Vertex AI, and sends one request to confirm the client works.
import os
import sys

# Colab needs an explicit interactive login; in other environments the
# client is built without this step.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

from IPython.display import HTML, Markdown, display
from google import genai
from google.genai.types import (
    FunctionDeclaration,
    GenerateContentConfig,
    GoogleSearch,
    HarmBlockThreshold,
    HarmCategory,
    MediaResolution,
    Part,
    Retrieval,
    SafetySetting,
    Tool,
    ToolCodeExecution,
    VertexAISearch,
)

PROJECT_ID = "genai-test-gili"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Fall back to the environment when the form field was left at its placeholder.
    # NOTE(review): str(None) yields the literal "None" if the variable is unset.
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "global")

# The client is created only once PROJECT_ID and LOCATION both exist, so the
# cell runs on a fresh kernel (Restart & Run All).
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

# Report which backend/credentials the client ended up using.
if not client.vertexai:
    print("Using Gemini Developer API.")
elif client._api_client.project:
    print(
        f"Using Vertex AI with project: {client._api_client.project} in location: {client._api_client.location}"
    )
elif client._api_client.api_key:
    print(
        f"Using Vertex AI in express mode with API key: {client._api_client.api_key[:5]}...{client._api_client.api_key[-5:]}"
    )

MODEL_ID = "gemini-2.0-flash"  # @param {type: "string"}

# Single request to check that the client, model ID and config are accepted.
# temperature=0 together with a fixed seed is presumably meant to keep the
# reply repeatable between runs.
system_instruction = "You are an expert meeting summarizer."
response = client.models.generate_content(
    model=MODEL_ID,
    contents="What's the largest planet in our solar system?",
    config=GenerateContentConfig(
        temperature=0,
        top_p=0.7,
        candidate_count=1,
        seed=0,
        max_output_tokens=2048,
        system_instruction=system_instruction,
    ),
)

display(Markdown(response.text))

Using Vertex AI with project: genai-test-gili in location: global


## 1. Configure AWS Credentials

Make sure you have AWS credentials configured with appropriate permissions for Amazon Bedrock and S3.

In [2]:
# --- Evaluation settings used by the rest of the notebook ---
NUM_SAMPLES_PER_EVAL = 10  # Reduced for demonstration
region = "us-east-1"  # Change to your preferred region where Bedrock is available
bucket_name = 'eval-datasets-us-east-1'

# IAM role with permissions for Bedrock evaluation. It is also exported to the
# environment (presumably read by the utils helpers; confirm in
# utils.bedrock_utils).
BEDROCK_ROLE_ARN = "arn:aws:iam::864016358360:role/service-role/Amazon-Bedrock-IAM-Role-20250531T202875"
os.environ["BEDROCK_ROLE_ARN"] = BEDROCK_ROLE_ARN

# Fail fast: confirm the active AWS credentials resolve to an identity before
# any Bedrock or S3 call is attempted. The error is printed and re-raised.
try:
    sts = boto3.client('sts')
    identity = sts.get_caller_identity()
    print(f"AWS Identity verified: {identity['Arn']}")
except Exception as e:
    print(f"Error verifying AWS credentials: {e}")
    raise

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


AWS Identity verified: arn:aws:sts::864016358360:assumed-role/Admin/gili-Isengard


## 2. Load MeetingBank Dataset and Prepare for Evaluation

In [4]:
# Fetch MeetingBank and show what came back.
dataset = load_meetingbank_dataset()
print(f"Dataset structure: {dataset}")
print(f"Available splits: {dataset.keys()}")

# Take a small slice of the test split; its size is set by NUM_SAMPLES_PER_EVAL.
test_samples = get_test_samples(
    dataset,
    num_samples=NUM_SAMPLES_PER_EVAL,
)
print(f"Number of test samples: {len(test_samples)}")

# Write the samples to an evaluation dataset file. include_model_responses=True
# asks the helper to include the modelResponses field (presumably left empty
# here and filled in by the response-generation step; confirm in
# utils.dataset_utils).
evaluation_dataset_path = prepare_for_bedrock_evaluation(
    test_samples,
    include_model_responses=True,
)
print(f"Evaluation dataset created at: {evaluation_dataset_path}")

Dataset structure: DatasetDict({
    train: Dataset({
        features: ['summary', 'uid', 'id', 'transcript'],
        num_rows: 5169
    })
    validation: Dataset({
        features: ['summary', 'uid', 'id', 'transcript'],
        num_rows: 861
    })
    test: Dataset({
        features: ['summary', 'uid', 'id', 'transcript'],
        num_rows: 862
    })
})
Available splits: dict_keys(['train', 'validation', 'test'])
Number of test samples: 10
Evaluation dataset created at: ./data/bedrock_evaluation_dataset.jsonl


## 3. Generate Model Responses

Here we'll use the Converse API to generate responses from different models.

In [6]:
# Models to compare, as (display name, model ID) pairs expanded into the
# {"name", "model_id"} dicts that generate_model_responses receives.
# Add other models as needed.
# For non-Bedrock models like Gemini Flash, you would need to implement a separate
# function to call their respective APIs and then format the responses in the
# same structure.
models = [
    {"name": name, "model_id": model_id}
    for name, model_id in (
        ("claude-3-haiku", "us.anthropic.claude-3-haiku-20240307-v1:0"),
        ("nova-lite", "us.amazon.nova-lite-v1:0"),
    )
]

# Optional system prompt for summarization
system_prompt = "You are an expert meeting summarizer."

# One output file per model; the result maps model name -> local dataset path.
model_dataset_paths = generate_model_responses(
    dataset_path=evaluation_dataset_path,
    models=models,
    system_prompt=system_prompt,
    region=region,
)

print("Datasets with model responses saved to:")
for model_name, path in model_dataset_paths.items():
    print(f"  - {model_name}: {path}")

INFO:utils.bedrock_utils:Generating responses for model claude-3-haiku
INFO:utils.bedrock_utils:Processing prompt 1/10 for model claude-3-haiku
INFO:utils.bedrock_utils:Generating response with model us.anthropic.claude-3-haiku-20240307-v1:0
INFO:utils.bedrock_utils:Input tokens: 1430
INFO:utils.bedrock_utils:Output tokens: 286
INFO:utils.bedrock_utils:Total tokens: 1716
INFO:utils.bedrock_utils:Stop reason: end_turn
INFO:utils.bedrock_utils:Processing prompt 2/10 for model claude-3-haiku
INFO:utils.bedrock_utils:Generating response with model us.anthropic.claude-3-haiku-20240307-v1:0
INFO:utils.bedrock_utils:Input tokens: 727
INFO:utils.bedrock_utils:Output tokens: 210
INFO:utils.bedrock_utils:Total tokens: 937
INFO:utils.bedrock_utils:Stop reason: end_turn
INFO:utils.bedrock_utils:Processing prompt 3/10 for model claude-3-haiku
INFO:utils.bedrock_utils:Generating response with model us.anthropic.claude-3-haiku-20240307-v1:0
INFO:utils.bedrock_utils:Input tokens: 358
INFO:utils.bedroc

Datasets with model responses saved to:
  - claude-3-haiku: ./data/bedrock_evaluation_dataset_claude-3-haiku.jsonl
  - nova-lite: ./data/bedrock_evaluation_dataset_nova-lite.jsonl


## 4. Upload Dataset with Responses to S3

In [7]:
# Make sure the target bucket exists and has a CORS configuration.
create_s3_bucket_if_not_exists(bucket_name, region)
apply_cors_if_not_exists(bucket_name, region)

# One timestamp is shared by every upload (and later by the job names), so all
# artifacts from this run can be matched up.
timestamp = datetime.now().strftime('%Y%m%d%H%M%S')

# S3 prefix under which the evaluation jobs will write their results.
output_s3_uri = f"s3://{bucket_name}/evaluation/results/"

# Upload each model's dataset and remember its S3 URI by model name.
dataset_s3_uris = {}
for model_name, dataset_path in model_dataset_paths.items():
    dataset_s3_key = f"evaluation/meetingbank_dataset_{model_name}_{timestamp}.jsonl"
    dataset_s3_uri = upload_to_s3(
        dataset_path,
        bucket_name,
        dataset_s3_key,
        region,
    )
    dataset_s3_uris[model_name] = dataset_s3_uri
    print(f"Dataset for {model_name} uploaded to: {dataset_s3_uri}")

print(f"Results will be stored at: {output_s3_uri}")

Bucket eval-datasets-us-east-1 already exists
CORS configuration already exists for bucket eval-datasets-us-east-1
Dataset for claude-3-haiku uploaded to: s3://eval-datasets-us-east-1/evaluation/meetingbank_dataset_claude-3-haiku_20250602180648.jsonl
Dataset for nova-lite uploaded to: s3://eval-datasets-us-east-1/evaluation/meetingbank_dataset_nova-lite_20250602180648.jsonl
Results will be stored at: s3://eval-datasets-us-east-1/evaluation/results/


## 5. Create and Run Bedrock Evaluation Job with Pre-generated Responses

In [10]:
# Submit one evaluation job per model; collect the resulting ARNs by model name.
job_arns = {}

for model_name, dataset_s3_uri in dataset_s3_uris.items():
    # Job names combine the model name with the shared run timestamp.
    job_name = f"meetingbank-{model_name}-{timestamp}"

    # Arguments are gathered first so the call itself stays short.
    job_request = dict(
        job_name=job_name,
        dataset_s3_uri=dataset_s3_uri,
        output_s3_uri=output_s3_uri,
        use_pregenerated_responses=True,
        model_id=model_name,
        region=region,
    )

    print(f"Creating evaluation job for {model_name}...")
    try:
        job_arn = create_evaluation_job(**job_request)
        job_arns[model_name] = job_arn
        print(f"Evaluation job for {model_name} created with ARN: {job_arn}")
    except Exception as e:
        # Best effort: report the failure and move on to the next model.
        print(f"Error creating evaluation job for {model_name}: {e}")

Creating evaluation job for claude-3-haiku...
Evaluation job for claude-3-haiku created with ARN: arn:aws:bedrock:us-east-1:864016358360:evaluation-job/wuk6fp3hd4k2
Creating evaluation job for nova-lite...
Evaluation job for nova-lite created with ARN: arn:aws:bedrock:us-east-1:864016358360:evaluation-job/p5fwdy446g3n


In [11]:
# Block on each submitted job in turn and keep the returned details by model name.
job_details = {}

for model_name, job_arn in job_arns.items():
    print(f"Waiting for {model_name} evaluation job to complete...")
    try:
        details = job_details[model_name] = wait_for_job_completion(job_arn, region)
        print(f"Job for {model_name} completed with status: {details['status']}")
    except Exception as e:
        # A failure on one job is reported but does not stop the others.
        print(f"Error waiting for {model_name} job: {e}")

Waiting for claude-3-haiku evaluation job to complete...
Job status: InProgress. Waiting 60 seconds...
Job status: InProgress. Waiting 60 seconds...
Job status: InProgress. Waiting 60 seconds...
Job status: InProgress. Waiting 60 seconds...
Job status: InProgress. Waiting 60 seconds...
Job status: InProgress. Waiting 60 seconds...
Job status: InProgress. Waiting 60 seconds...
Job status: InProgress. Waiting 60 seconds...
Job status: InProgress. Waiting 60 seconds...
Job status: InProgress. Waiting 60 seconds...
Job status: InProgress. Waiting 60 seconds...
Job for claude-3-haiku completed with status: Completed
Waiting for nova-lite evaluation job to complete...
Job for nova-lite completed with status: Completed


## 6. Download and Analyze Results

In [None]:
# Download each job's evaluation output from S3 into ./results/<job name>.
results_dir = './results'
os.makedirs(results_dir, exist_ok=True)

# output_s3_uri already ends with "/", so strip it before joining; otherwise the
# prefix contains "//", which S3 treats as a different key prefix.
results_root_s3 = output_s3_uri.rstrip('/')

# The loop variable is deliberately NOT named job_details: rebinding that name
# would replace the dict being iterated and break re-running this cell.
for model_name, details in job_details.items():
    # Use the job name reported in the job details; if absent, rebuild the name
    # from the model name and run timestamp.
    job_name = details.get('jobName', f"meetingbank-{model_name}-{timestamp}")
    results_local_path = f'{results_dir}/{job_name}'

    # Download the evaluation results
    results_base_dir_s3 = f"{results_root_s3}/{job_name}/"
    print(f"Downloading results for {model_name} from {results_base_dir_s3}")

    try:
        download_evaluation_results(results_base_dir_s3, results_local_path, region)
        print(f"Results for {model_name} downloaded to: {results_local_path}")
    except Exception as e:
        # Report and continue so one failed download does not block the rest.
        print(f"Error downloading results for {model_name}: {e}")

In [None]:
# Analyze everything downloaded under results_dir.
print("Analyzing evaluation results...")
results_df = analyze_evaluation_results(results_dir)

# Print the heading and the results in one call (output is unchanged).
print("\nEvaluation Results:", results_df, sep="\n")

In [None]:
# Plot the analyzed results, passing a PNG output path to the helper, then show the figure.
print("Creating visualization...")
ax = visualize_evaluation_results(
    results_df,
    output_path='evaluation_results_pregenerated.png',
)
plt.show()

## 7. Conclusion

This notebook demonstrated how to:
1. Load the MeetingBank dataset and prepare it for Bedrock evaluation
2. Generate model responses using the Converse API
3. Upload the datasets with pre-generated responses to S3
4. Create and run Bedrock evaluation jobs with pre-generated responses
5. Download, analyze, and visualize the evaluation results

This approach allows you to evaluate any model, including non-Bedrock models like Gemini Flash, by pre-generating the responses and including them in the evaluation dataset.