# Evaluating Amazon Nova and Anthropic Claude Models on the MeetingBank Dataset

This notebook demonstrates how to evaluate Amazon Bedrock models (Amazon Nova Micro, Lite, Pro, and Premier, plus Anthropic Claude 3 Haiku and Claude 3.5 Sonnet v2) on the MeetingBank dataset for meeting summarization tasks.

In [1]:
# Import required libraries
%load_ext autoreload
%autoreload 2
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import boto3
from datetime import datetime

# Import utility functions
from utils.dataset_utils import load_meetingbank_dataset, get_test_samples, prepare_for_bedrock_evaluation
from utils.bedrock_utils import (
    create_s3_bucket_if_not_exists,
    apply_cors_if_not_exists,
    upload_to_s3,
    create_evaluation_job,
    wait_for_job_completion,
    download_evaluation_results,
    analyze_evaluation_results,
    visualize_evaluation_results
)

## 1. Configure AWS Credentials

Make sure you have AWS credentials configured with appropriate permissions for Amazon Bedrock and S3.

In [None]:
# --- Evaluation settings ---------------------------------------------------
# Number of samples requested from get_test_samples for each evaluation run.
NUM_SAMPLES_PER_EVAL = 862
# Bucket used for the evaluation dataset and for the job results.
bucket_name = 'eval-datasets-us-east-1'
# Change to your preferred region where Bedrock is available.
region = "us-east-1"

# IAM role for Bedrock evaluation; it needs permissions to access S3 and
# invoke Bedrock models. Replace the value with your own role ARN, in the form
# arn:aws:iam::YOUR_ACCOUNT_ID:role/YOUR_BEDROCK_ROLE.
BEDROCK_ROLE_ARN = "arn:aws:iam::864016358360:role/service-role/Amazon-Bedrock-IAM-Role-20250531T202875"
# Exported as an environment variable as well
# (presumably read by the utils helpers - TODO confirm).
os.environ["BEDROCK_ROLE_ARN"] = BEDROCK_ROLE_ARN

# Fail fast when no usable AWS credentials are available: ask STS who we are,
# report the problem and re-raise if the call does not succeed.
try:
    caller_identity = boto3.client('sts').get_caller_identity()
    print(f"AWS Identity verified: {caller_identity['Arn']}")
except Exception as err:
    print(f"Error verifying AWS credentials: {err}")
    raise

## 2. Load MeetingBank Dataset

In [None]:
# Fetch MeetingBank through the project helper, then report the object it
# returned and the keys (splits) it exposes.
dataset = load_meetingbank_dataset()
print(f"Dataset structure: {dataset}")
split_names = dataset.keys()
print(f"Available splits: {split_names}")

In [None]:
# Request NUM_SAMPLES_PER_EVAL samples from the dataset via get_test_samples
test_samples = get_test_samples(dataset, num_samples=NUM_SAMPLES_PER_EVAL)
print(f"Number of test samples: {len(test_samples)}")

# Preview the first five samples: sizes of both text fields plus the first
# 200 characters of the reference summary.
for sample_number, sample in enumerate(test_samples[:5], start=1):
    print(f"\nSample {sample_number}:")
    print(f"Transcript length: {len(sample['transcript'])} characters")
    reference_summary = sample['summary']
    print(f"Summary length: {len(reference_summary)} characters")
    print(f"Summary: {reference_summary[:200]}...")

## 3. Prepare Dataset for Bedrock Evaluation

In [None]:
# Convert the selected samples into the evaluation file; the helper returns
# the path of the file it produced.
evaluation_dataset_path = prepare_for_bedrock_evaluation(test_samples)
print(f"Evaluation dataset created at: {evaluation_dataset_path}")

# Sanity-check the file: each line is parsed as one JSON record, and only the
# first few records are summarised.
max_preview_records = 5
with open(evaluation_dataset_path, 'r') as dataset_file:
    for record_number, raw_line in enumerate(dataset_file, start=1):
        if record_number > max_preview_records:
            break
        record = json.loads(raw_line)
        print(f"\nRecord {record_number}:")
        print(f"Prompt length: {len(record['prompt'])} characters")
        print(f"Reference response length: {len(record['referenceResponse'])} characters")
        print(f"Category: {record['category']}")

## 4. Upload Dataset to S3

In [None]:
# Make sure the target bucket exists and has a CORS configuration before
# anything is uploaded to it.
create_s3_bucket_if_not_exists(bucket_name, region)
apply_cors_if_not_exists(bucket_name, region)

# Push the local evaluation file to S3; the helper returns the resulting URI.
dataset_s3_key = "evaluation/meetingbank_dataset.jsonl"
dataset_s3_uri = upload_to_s3(
    evaluation_dataset_path,
    bucket_name,
    dataset_s3_key,
    region,
)
print(f"Dataset uploaded to: {dataset_s3_uri}")

# S3 location handed to the evaluation jobs as their output destination.
# Note that it ends with a trailing slash.
results_prefix = "evaluation/results/"
output_s3_uri = f"s3://{bucket_name}/{results_prefix}"
print(f"Results will be stored at: {output_s3_uri}")

## 5. Create and Run Bedrock Evaluation Job

In [None]:
# Models to evaluate. Each entry carries a short display name and the
# identifier passed to create_evaluation_job as model_id. The loop below adds
# 'job_name' and 'job_arn' to every entry for use by later cells.
models = [
    {"name": "nova-micro", "model_id": "us.amazon.nova-micro-v1:0"},
    {"name": "nova-lite", "model_id": "us.amazon.nova-lite-v1:0"},
    {"name": "nova-pro", "model_id": "us.amazon.nova-pro-v1:0"},
    {"name": "nova-premier", "model_id": "us.amazon.nova-premier-v1:0"},
    {"name": "haiku-3", "model_id": "us.anthropic.claude-3-haiku-20240307-v1:0"},
    {"name": "sonnet-3-5-v2", "model_id": "us.anthropic.claude-3-5-sonnet-20241022-v2:0"},
]

# Submit one evaluation job per model. Any failure is reported and re-raised,
# so the cell stops at the first model whose job cannot be created.
for model in models:
    print(f"Model: {model}")

    # The lower-cased name is stored back on the entry and used in the job name.
    model_name = model["name"].lower()
    model["name"] = model_name
    model_id = model["model_id"]

    # A second-resolution timestamp keeps job names unique across runs.
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    job_name = f"meetingbank-{model_name}-{timestamp}"
    model['job_name'] = job_name

    try:
        job_arn = create_evaluation_job(
            job_name=job_name,
            dataset_s3_uri=dataset_s3_uri,
            output_s3_uri=output_s3_uri,
            model_id=model_id,
            region=region,
        )
    except Exception as err:
        print(f"Error creating evaluation job: {err}")
        raise
    else:
        print(f"Evaluation job created with ARN: {job_arn}")
        model['job_arn'] = job_arn

In [None]:
# Wait for every evaluation job recorded on the `models` entries, one after
# another in list order. Each entry must already carry the 'job_arn' stored
# when its job was created.
for model in models:
    job_arn = model['job_arn']
    # Single quotes inside the f-string: reusing the enclosing double quotes
    # is a SyntaxError on Python versions before 3.12.
    print(f"name = {model['name']}. Job ARN: {job_arn}")
    # Wait for the job to complete
    print("Waiting for evaluation job to complete...")
    job_details = wait_for_job_completion(job_arn, region)
    print(f"Job completed with status: {job_details['status']}")

## 6. Download and Analyze Results

In [None]:
# Local directory that receives one sub-folder of results per evaluation job.
results_dir = './results'
os.makedirs(results_dir, exist_ok=True)

for model in models:
    job_name = model['job_name']
    results_local_path = f'{results_dir}/{job_name}'
    # Build the S3 prefix for this job's results. output_s3_uri may already end
    # with "/", so strip it before joining; otherwise the prefix would contain
    # "//", which is a different S3 key prefix.
    results_base_dir_s3 = f"{output_s3_uri.rstrip('/')}/{job_name}/"
    print(f"Downloading results for {model['name']} from {results_base_dir_s3}")

    try:
        download_evaluation_results(results_base_dir_s3, results_local_path, region)
        print(f"Results downloaded to: {results_local_path}")
    except Exception as e:
        # Best effort: report the failure and continue with the remaining
        # models so one failed download does not block the others.
        print(f"Error downloading results: {e}")

In [None]:
# Run the analysis helper over everything downloaded into results_dir
print("Analyzing evaluation results...")
results_df = analyze_evaluation_results(results_dir)

# Print the object returned by the analysis helper under a heading
print("\nEvaluation Results:")
print(results_df)

In [None]:
# Plot the analysed results; the output path is handed to the helper
# (presumably where the figure is saved - TODO confirm), then the figure is shown.
print("Creating visualization...")
ax = visualize_evaluation_results(
    results_df,
    output_path='evaluation_results.png',
)
plt.show()

## 7. Conclusion

This notebook demonstrated how to:
1. Load the MeetingBank dataset
2. Prepare the dataset for Bedrock evaluation
3. Create and run a Bedrock evaluation job for each model
4. Analyze and visualize the evaluation results

The evaluation compared Nova models and Claude models on meeting summarization tasks using built-in Bedrock evaluators for Relevance, Correctness, Completeness, and Coherence.