# Evaluating Pre-generated Model Responses on MeetingBank Dataset

This notebook demonstrates how to evaluate pre-generated responses from both Amazon Bedrock models and external models (such as Gemini Flash) on the MeetingBank dataset for meeting summarization.

In [1]:
# Import required libraries
%load_ext autoreload
%autoreload 2
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import boto3
from datetime import datetime

# Import utility functions
from utils.dataset_utils import load_meetingbank_dataset, get_test_samples, prepare_for_bedrock_evaluation, load_evaluation_dataset
from utils.bedrock_utils import (
    create_s3_bucket_if_not_exists,
    apply_cors_if_not_exists,
    upload_to_s3,
    create_evaluation_job,
    wait_for_job_completion,
    download_evaluation_results,
    analyze_evaluation_results,
    visualize_evaluation_results,
    generate_model_responses
)

## 1. Configure AWS Credentials

Make sure you have AWS credentials configured with appropriate permissions for Amazon Bedrock and S3.

In [14]:
# --- Evaluation settings: adjust these for your own AWS account ---
region = "us-east-1"  # pick any region where Amazon Bedrock is available
bucket_name = "eval-datasets-us-east-1"  # S3 bucket holding datasets and results
NUM_SAMPLES_PER_EVAL = 862  # number of test samples requested from the dataset
system_prompt = "You are an expert meeting summarizer."  # optional; passed to the response-generation helpers

# IAM role for Bedrock evaluation. It is exported through the environment;
# presumably utils.bedrock_utils reads it from there — verify against the helper.
BEDROCK_ROLE_ARN = "arn:aws:iam::864016358360:role/service-role/Amazon-Bedrock-IAM-Role-20250531T202875"
os.environ["BEDROCK_ROLE_ARN"] = BEDROCK_ROLE_ARN

# Fail fast if the current AWS credentials cannot be resolved: report the
# problem and re-raise so later cells do not run with a broken session.
try:
    sts = boto3.client("sts")
    identity = sts.get_caller_identity()
    caller_arn = identity["Arn"]
    print(f"AWS Identity verified: {caller_arn}")
except Exception as err:
    print(f"Error verifying AWS credentials: {err}")
    raise

AWS Identity verified: arn:aws:sts::864016358360:assumed-role/Admin/gili-Isengard


## 2. Set up Gemini Client (Optional)

Set up the Gemini client for evaluating Google's Gemini models.

In [None]:
# Set up Gemini client for external model evaluation
import sys

# Install the Google Generative AI library if not already installed
try:
    from google import genai
    from google.genai.types import GenerateContentConfig
    from utils.external_model_utils import generate_gemini_responses
except ImportError:
    print("Installing Google Generative AI library...")
    !pip install -q google-generativeai
    from google import genai
    from google.genai.types import GenerateContentConfig
    from utils.external_model_utils import generate_gemini_responses

# Authenticate with Google Cloud if running in Colab
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user()

# Import for displaying results
from IPython.display import HTML, Markdown, display

# Configure project and location
PROJECT_ID = "genai-test-gili"  # @param {type: "string", placeholder: "[your-project-id]"}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "global")

# Initialize the Gemini client
gemini_available = False
try:
    client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)
    
    if not client.vertexai:
        print("Using Gemini Developer API.")
    elif client._api_client.project:
        print(f"Using Vertex AI with project: {client._api_client.project} in location: {client._api_client.location}")
    elif client._api_client.api_key:
        print(f"Using Vertex AI in express mode with API key: {client._api_client.api_key[:5]}...{client._api_client.api_key[-5:]}")
    
    # Test the client with a simple query
    GEMINI_MODEL_ID = "gemini-2.0-flash"  # @param {type: "string"}
    
    system_instruction = system_prompt
    response = client.models.generate_content(
        model=GEMINI_MODEL_ID, 
        contents="What's the largest planet in our solar system?",
        config=GenerateContentConfig(
            temperature=0,
            top_p=0.7,
            candidate_count=1,
            seed=0,
            max_output_tokens=2048,
            system_instruction=system_instruction
        ),
    )
    
    display(Markdown(response.text))
    print("Gemini client initialized successfully!")
    gemini_available = True
    
except Exception as e:
    print(f"Error initializing Gemini client: {e}")
    print("Continuing without Gemini models.")

INFO:google_genai.models:AFC is enabled with max remote calls: 10.


Using Vertex AI with project: genai-test-gili in location: global


INFO:httpx:HTTP Request: POST https://aiplatform.googleapis.com/v1beta1/projects/genai-test-gili/locations/global/publishers/google/models/gemini-2.0-flash:generateContent "HTTP/1.1 200 OK"
INFO:google_genai.models:AFC remote call 1 is done.


Jupiter is the largest planet in our solar system.


Gemini client initialized successfully!


## 3. Load MeetingBank Dataset and Prepare for Evaluation

In [16]:
# Fetch MeetingBank and report its structure and available splits
dataset = load_meetingbank_dataset()
available_splits = dataset.keys()
print(
    f"Dataset structure: {dataset}",
    f"Available splits: {available_splits}",
    sep="\n",
)

# Draw the evaluation samples from the test split
test_samples = get_test_samples(dataset, num_samples=NUM_SAMPLES_PER_EVAL)
n_test_samples = len(test_samples)
print(f"Number of test samples: {n_test_samples}")

# Write the Bedrock evaluation dataset file.
# NOTE(review): include_model_responses=True presumably adds a modelResponses
# field that Section 4 fills in — confirm against utils.dataset_utils.
evaluation_dataset_path = prepare_for_bedrock_evaluation(
    test_samples,
    include_model_responses=True,
)
print(f"Evaluation dataset created at: {evaluation_dataset_path}")

Dataset structure: DatasetDict({
    train: Dataset({
        features: ['summary', 'uid', 'id', 'transcript'],
        num_rows: 5169
    })
    validation: Dataset({
        features: ['summary', 'uid', 'id', 'transcript'],
        num_rows: 861
    })
    test: Dataset({
        features: ['summary', 'uid', 'id', 'transcript'],
        num_rows: 862
    })
})
Available splits: dict_keys(['train', 'validation', 'test'])
Number of test samples: 862
Evaluation dataset created at: ./data/bedrock_evaluation_dataset.jsonl


## 4. Generate Model Responses

### 4.1 Generate Bedrock Model Responses

In [None]:
# Bedrock models to compare, listed as (short name, Bedrock model id) pairs
_bedrock_model_specs = [
    ("nova-micro", "us.amazon.nova-micro-v1:0"),
    ("nova-lite", "us.amazon.nova-lite-v1:0"),
    ("nova-pro", "us.amazon.nova-pro-v1:0"),
    ("nova-premier", "us.amazon.nova-premier-v1:0"),
    ("haiku-3", "us.anthropic.claude-3-haiku-20240307-v1:0"),
    ("sonnet-3-5-v2", "us.anthropic.claude-3-5-sonnet-20241022-v2:0"),
]

# Expand the pairs into the list-of-dicts form handed to generate_model_responses
bedrock_models = [
    {"name": short_name, "model_id": bedrock_model_id}
    for short_name, bedrock_model_id in _bedrock_model_specs
]

# Produce one response dataset per model; the helper returns a mapping of
# model name -> path of that model's dataset file
bedrock_dataset_paths = generate_model_responses(
    models=bedrock_models,
    dataset_path=evaluation_dataset_path,
    region=region,
    system_prompt=system_prompt,
)

print("Datasets with Bedrock model responses saved to:")
for model_name in bedrock_dataset_paths:
    print(f"  - {model_name}: {bedrock_dataset_paths[model_name]}")

INFO:utils.bedrock_utils:Generating responses for model claude-3-haiku
INFO:utils.bedrock_utils:Processing prompt 1/10 for model claude-3-haiku
INFO:utils.bedrock_utils:Generating response with model us.anthropic.claude-3-haiku-20240307-v1:0
INFO:utils.bedrock_utils:Processing prompt 2/10 for model claude-3-haiku
INFO:utils.bedrock_utils:Generating response with model us.anthropic.claude-3-haiku-20240307-v1:0
INFO:utils.bedrock_utils:Processing prompt 3/10 for model claude-3-haiku
INFO:utils.bedrock_utils:Generating response with model us.anthropic.claude-3-haiku-20240307-v1:0
INFO:utils.bedrock_utils:Processing prompt 4/10 for model claude-3-haiku
INFO:utils.bedrock_utils:Generating response with model us.anthropic.claude-3-haiku-20240307-v1:0
INFO:utils.bedrock_utils:Processing prompt 5/10 for model claude-3-haiku
INFO:utils.bedrock_utils:Generating response with model us.anthropic.claude-3-haiku-20240307-v1:0
INFO:utils.bedrock_utils:Processing prompt 6/10 for model claude-3-haiku
I

Datasets with Bedrock model responses saved to:
  - claude-3-haiku: ./data/bedrock_evaluation_dataset_claude-3-haiku.jsonl
  - nova-lite: ./data/bedrock_evaluation_dataset_nova-lite.jsonl


### 4.2 Generate Gemini Model Responses (if available)

In [None]:
# Collect Gemini response datasets (model id -> dataset path).
# The mapping stays empty when the Gemini client could not be initialised.
gemini_dataset_paths = {}

if not gemini_available:
    print("Skipping Gemini model response generation as Gemini client is not available.")
else:
    # Gemini models to generate responses with
    gemini_models = ["gemini-2.0-flash"]

    for model_id in gemini_models:
        print(f"Generating responses for {model_id}...")
        try:
            # A failure for one model is reported and does not stop the others
            gemini_dataset_paths[model_id] = generate_gemini_responses(
                dataset_path=evaluation_dataset_path,
                model_id=model_id,
                system_prompt=system_prompt,
                project_id=PROJECT_ID,
                location=LOCATION,
            )
            print(f"Responses for {model_id} saved to {gemini_dataset_paths[model_id]}")
        except Exception as err:
            print(f"Error generating responses for {model_id}: {err}")

    print("\nDatasets with Gemini model responses:")
    for model_id in gemini_dataset_paths:
        print(f"  - {model_id}: {gemini_dataset_paths[model_id]}")

INFO:utils.external_model_utils:Generating responses for 862 prompts using Gemini model gemini-2.0-flash
INFO:utils.external_model_utils:Initialized Gemini client for model gemini-2.0-flash
INFO:utils.external_model_utils:Processing prompt 1/862 for model gemini-2.0-flash
INFO:google_genai.models:AFC is enabled with max remote calls: 10.


Generating responses for gemini-2.0-flash...


INFO:httpx:HTTP Request: POST https://aiplatform.googleapis.com/v1beta1/projects/genai-test-gili/locations/global/publishers/google/models/gemini-2.0-flash:generateContent "HTTP/1.1 200 OK"
INFO:google_genai.models:AFC remote call 1 is done.
INFO:utils.external_model_utils:Successfully generated response for prompt 1
INFO:utils.external_model_utils:Processing prompt 2/862 for model gemini-2.0-flash
INFO:google_genai.models:AFC is enabled with max remote calls: 10.
INFO:httpx:HTTP Request: POST https://aiplatform.googleapis.com/v1beta1/projects/genai-test-gili/locations/global/publishers/google/models/gemini-2.0-flash:generateContent "HTTP/1.1 200 OK"
INFO:google_genai.models:AFC remote call 1 is done.
INFO:utils.external_model_utils:Successfully generated response for prompt 2
INFO:utils.external_model_utils:Processing prompt 3/862 for model gemini-2.0-flash
INFO:google_genai.models:AFC is enabled with max remote calls: 10.
INFO:httpx:HTTP Request: POST https://aiplatform.googleapis.co

## 5. Combine All Model Datasets

In [11]:
# Merge the Bedrock and (when available) Gemini dataset paths into one
# mapping of model name -> dataset path; Bedrock entries come first
gemini_paths_to_merge = gemini_dataset_paths if gemini_available else {}
all_model_dataset_paths = {**bedrock_dataset_paths, **gemini_paths_to_merge}

model_count = len(all_model_dataset_paths)
print(f"Total models for evaluation: {model_count}")
for model_name in all_model_dataset_paths:
    print(f"  - {model_name}: {all_model_dataset_paths[model_name]}")

Total models for evaluation: 3
  - claude-3-haiku: ./data/bedrock_evaluation_dataset_claude-3-haiku.jsonl
  - nova-lite: ./data/bedrock_evaluation_dataset_nova-lite.jsonl
  - gemini-2.0-flash: ./data/bedrock_evaluation_dataset_gemini-2-0-flash.jsonl


## 6. Upload Dataset with Responses to S3

In [12]:
# Make sure the destination bucket exists and has a CORS configuration
create_s3_bucket_if_not_exists(bucket_name, region)
apply_cors_if_not_exists(bucket_name, region)

# One timestamp is shared by every upload in this run (and reused for job names later)
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

# '.' and ':' in model names are replaced with '-' before building S3 keys
_key_safe_chars = str.maketrans({".": "-", ":": "-"})

# Upload each model's dataset and remember its S3 URI (model name -> URI)
dataset_s3_uris = {}
for model_name in all_model_dataset_paths:
    local_dataset_path = all_model_dataset_paths[model_name]
    key_safe_name = model_name.translate(_key_safe_chars)
    s3_key = f"evaluation/meetingbank_dataset_{key_safe_name}_{timestamp}.jsonl"
    dataset_s3_uris[model_name] = upload_to_s3(local_dataset_path, bucket_name, s3_key, region)
    print(f"Dataset for {model_name} uploaded to: {dataset_s3_uris[model_name]}")

# S3 location under which the evaluation jobs write their results
output_s3_uri = f"s3://{bucket_name}/evaluation/results/"
print(f"Results will be stored at: {output_s3_uri}")

Bucket eval-datasets-us-east-1 already exists
CORS configuration already exists for bucket eval-datasets-us-east-1
Dataset for claude-3-haiku uploaded to: s3://eval-datasets-us-east-1/evaluation/meetingbank_dataset_claude-3-haiku_20250602195427.jsonl
Dataset for nova-lite uploaded to: s3://eval-datasets-us-east-1/evaluation/meetingbank_dataset_nova-lite_20250602195427.jsonl
Dataset for gemini-2.0-flash uploaded to: s3://eval-datasets-us-east-1/evaluation/meetingbank_dataset_gemini-2-0-flash_20250602195427.jsonl
Results will be stored at: s3://eval-datasets-us-east-1/evaluation/results/


## 7. Create and Run Bedrock Evaluation Job with Pre-generated Responses

In [13]:
# Start one Bedrock evaluation job per uploaded dataset (model name -> job ARN).
# A model whose job fails to start is reported and left out of the mapping.
job_arns = {}

for model_name in dataset_s3_uris:
    # '.' and ':' are replaced with '-' for use in the job name and identifier
    job_label = model_name.replace(".", "-").replace(":", "-")

    print(f"Creating evaluation job for {model_name}...")
    try:
        job_arns[model_name] = create_evaluation_job(
            job_name=f"meetingbank-{job_label}-{timestamp}",
            dataset_s3_uri=dataset_s3_uris[model_name],
            output_s3_uri=output_s3_uri,
            use_pregenerated_responses=True,
            model_id=job_label,  # the sanitized name doubles as the model identifier
            region=region,
        )
        print(f"Evaluation job for {model_name} created with ARN: {job_arns[model_name]}")
    except Exception as err:
        print(f"Error creating evaluation job for {model_name}: {err}")

Creating evaluation job for claude-3-haiku...
Evaluation job for claude-3-haiku created with ARN: arn:aws:bedrock:us-east-1:864016358360:evaluation-job/hzfy5vu2s1cd
Creating evaluation job for nova-lite...
Evaluation job for nova-lite created with ARN: arn:aws:bedrock:us-east-1:864016358360:evaluation-job/ialy9lfurf0x
Creating evaluation job for gemini-2.0-flash...
Evaluation job for gemini-2.0-flash created with ARN: arn:aws:bedrock:us-east-1:864016358360:evaluation-job/uyu44egvbncc


In [None]:
# Block until each evaluation job finishes, keeping the details returned for
# every model (model name -> job details). Failures are reported per model.
job_details = {}

for model_name in job_arns:
    print(f"Waiting for {model_name} evaluation job to complete...")
    try:
        job_details[model_name] = wait_for_job_completion(job_arns[model_name], region)
        final_status = job_details[model_name]['status']
        print(f"Job for {model_name} completed with status: {final_status}")
    except Exception as err:
        print(f"Error waiting for {model_name} job: {err}")

## 8. Download and Analyze Results

In [None]:
# Local directory that receives one sub-folder of results per evaluation job
results_dir = './results'
os.makedirs(results_dir, exist_ok=True)

# output_s3_uri may end with '/', so strip it before joining; otherwise the
# prefix would contain '//' and S3 would treat it as a different key prefix.
results_root_s3 = output_s3_uri.rstrip('/')

# Download results for each model.
# The loop variable is deliberately NOT called `job_details`: rebinding that
# name would replace the per-model mapping with the last job's details, so
# re-running this cell would iterate over the wrong object.
for model_name, details in job_details.items():
    sanitized_model_name = model_name.replace('.', '-').replace(':', '-')
    # Prefer the job name reported in the job details; otherwise rebuild the
    # name that was used when the job was created.
    job_name = details.get('jobName', f"meetingbank-{sanitized_model_name}-{timestamp}")
    results_local_path = f'{results_dir}/{job_name}'

    # Download the evaluation results
    results_base_dir_s3 = f"{results_root_s3}/{job_name}/"
    print(f"Downloading results for {model_name} from {results_base_dir_s3}")

    try:
        download_evaluation_results(results_base_dir_s3, results_local_path, region)
        print(f"Results for {model_name} downloaded to: {results_local_path}")
    except Exception as e:
        # Keep going so one failed download does not block the other models.
        print(f"Error downloading results for {model_name}: {e}")

In [None]:
# Run the analysis helper over everything downloaded into results_dir
print("Analyzing evaluation results...")
results_df = analyze_evaluation_results(results_dir)

# Show the analysis result under a heading
print("\nEvaluation Results:", results_df, sep="\n")

In [None]:
# Plot the evaluation results.
# NOTE(review): output_path is presumably where the helper saves the figure — confirm.
print("Creating visualization...")
ax = visualize_evaluation_results(
    results_df,
    output_path="evaluation_results_all_models.png",
)
plt.show()

## 9. Conclusion

This notebook demonstrated how to:
1. Load the MeetingBank dataset
2. Generate model responses using both Bedrock models and Gemini models
3. Upload the per-model response datasets to S3
4. Create and run Bedrock evaluation jobs with pre-generated responses
5. Download, analyze, and visualize the evaluation results

This approach allows you to evaluate and compare any models, including both Bedrock models and external models like Gemini Flash, using Amazon Bedrock's evaluation capabilities.