In [None]:
!pip install --upgrade boto3 awscli matplotlib numpy pandas anthropic

# Benchmark latency for Amazon Bedrock models
Notes:
1. This benchmark can measure either complete (non-streaming) responses or streaming responses.
2. Latency may be lower with provisioned throughput; this notebook currently uses on-demand. See: TODO
3. Using boto3 Bedrock API.

Test Amazon Bedrock setup

# TODO add configuration section
-- Streaming | complete responses
-- Boto3 retries - should be 0 normally
-- models to test
-- Input tokens
-- Output tokens

In [2]:
import boto3, botocore
# Automatic retries are switched off so that each measurement covers exactly one transaction.
bedrock = boto3.client(
    service_name='bedrock-runtime',
    config=botocore.config.Config(retries={'max_attempts': 0}),
)
# Optional sanity checks, left disabled.
# NOTE(review): these look like control-plane calls - confirm they are available on the
# 'bedrock-runtime' client before uncommenting.
#print(bedrock.list_foundation_models())
#bedrock.get_foundation_model(modelIdentifier='anthropic.claude-v2')

In [3]:
import random 
import anthropic

# Anthropic SDK client. The cells shown here only call client.count_tokens() on it to measure
# prompt length; the benchmarked requests themselves go through the boto3 `bedrock` client.
client = anthropic.Anthropic() # used to count tokens only

def _get_prompt_template(num_input_tokens):
    """Assemble a text-completion prompt padded with random filler words.

    The prompt is meant to allow arbitrarily long input and to provoke
    arbitrarily long output: the filler sits inside an ``<X>`` block that the
    prompt tells the model to ignore, and the instruction that follows asks for
    a very long enumeration.

    Args:
        num_input_tokens: Controls the amount of filler. ``num_input_tokens - 1``
            words are drawn at random from a fixed four-word vocabulary, each
            followed by a single space; values of 1 or less produce no filler.

    Returns:
        The complete prompt as one string.
    """
    filler_vocabulary = ['hello', 'world', 'foo', 'bar']
    filler = ''.join(
        random.choice(filler_vocabulary) + ' '
        for _ in range(num_input_tokens - 1)
    )
    sections = [
        'Human:',
        'Ignore X<X>',
        filler,
        '</X>',
        "print numbers 1 to 9999 as words. don't omit for brevity",
        # Pre-filled assistant turn: the model will continue with " three four five..."
        '\n\nAssistant:one two',
    ]
    return ''.join(sections)


def get_text_tokens(expected_num_tokens):
    """Return a benchmark prompt that is exactly ``expected_num_tokens`` tokens long.

    The fixed part of the prompt (instructions without filler) is measured with
    ``client.count_tokens``; the remaining budget is requested as filler from
    ``_get_prompt_template``. The finished prompt is counted again and must hit
    the target exactly. The prompt asks the model to spell out the numbers
    1 to 9999, so the response can be made as long as the caller's output limit.

    Args:
        expected_num_tokens: Desired prompt length, in tokens as counted by
            ``client.count_tokens``.

    Returns:
        The prompt string.

    Raises:
        AssertionError: If the generated prompt does not have exactly the
            requested number of tokens, e.g. when the request is smaller than
            the fixed part of the prompt.
    """
    num_tokens_in_prompt_template = client.count_tokens(_get_prompt_template(0))
    # Never ask for a negative amount of filler; too-small requests fail the check below.
    additional_tokens_needed = max(expected_num_tokens - num_tokens_in_prompt_template, 0)

    prompt_template = _get_prompt_template(additional_tokens_needed)

    actual_num_tokens = client.count_tokens(prompt_template)
    assert expected_num_tokens == actual_num_tokens, (
        'Failed to generate prompt at required length: '
        f'expected_num_tokens={expected_num_tokens} != actual_num_tokens={actual_num_tokens}'
    )

    return prompt_template

In [4]:
#print(get_text_tokens(39))

In [5]:
import time, json
from botocore.exceptions import ClientError
sleep_on_throttling_sec = 5  # pause before retrying a throttled request


def benchmark(bedrock, prompt, max_tokens_to_sample, stream=True, temprature=0,
              model_id='anthropic.claude-v2'):
    """Time one text-completion request against Amazon Bedrock.

    The request is repeated only when the service answers with a throttling
    error; every attempt is timed from scratch, so the returned durations
    describe the single successful attempt.

    Args:
        bedrock: Client object exposing ``invoke_model`` and
            ``invoke_model_with_response_stream``.
        prompt: Prompt text placed in the request body.
        max_tokens_to_sample: Output token limit placed in the request body.
        stream: When true, use the streaming API and take the first-byte time
            at the first received chunk. When false, the first-byte time is
            taken when ``invoke_model`` returns and the last-byte time after
            the response body has been read completely.
        temprature: Sampling temperature placed in the request body (the
            misspelled name is kept for backward compatibility).
        model_id: Identifier of the model to invoke. The request body uses the
            ``prompt`` / ``max_tokens_to_sample`` format.

    Returns:
        A ``(duration_to_first_byte, duration_to_last_byte)`` tuple in seconds.

    Raises:
        AssertionError: If generation stopped for any reason other than
            reaching ``max_tokens_to_sample``, i.e. fewer tokens than requested
            were produced.
        RuntimeError: If a streaming response ended without delivering a chunk.
        botocore.exceptions.ClientError: For any service error that is not a
            throttling error.
    """
    accept = 'application/json'
    contentType = 'application/json'

    body = json.dumps({
        "prompt": prompt,
        "max_tokens_to_sample": max_tokens_to_sample,
        "temperature": temprature,
    })

    while True:
        try:
            # perf_counter is monotonic, so durations cannot be skewed by wall-clock adjustments.
            start = time.perf_counter()

            if stream:
                response = bedrock.invoke_model_with_response_stream(
                    body=body, modelId=model_id, accept=accept, contentType=contentType)

                first_byte = None
                last_chunk = None
                for event in response.get('body'):
                    chunk = event.get('chunk')
                    if chunk:
                        if first_byte is None:
                            first_byte = time.perf_counter()  # time to first byte
                        last_chunk = chunk
                last_byte = time.perf_counter()

                if last_chunk is None:
                    raise RuntimeError('Response stream ended without returning any chunk.')
                # end of stream - the stop_reason is read from the last chunk received
                stop_reason = json.loads(last_chunk.get('bytes').decode())['stop_reason']
            else:
                response = bedrock.invoke_model(
                    body=body, modelId=model_id, accept=accept, contentType=contentType)
                first_byte = time.perf_counter()
                response_body = json.loads(response.get('body').read())
                # Taken after read() so that fetching the body is part of the measurement.
                last_byte = time.perf_counter()
                stop_reason = response_body['stop_reason']

            # verify we got all of the intended output tokens
            assert stop_reason == 'max_tokens', f"stop_reason is {stop_reason} instead of 'max_tokens', this means the model generated less tokens than required."

            return first_byte - start, last_byte - start
        except ClientError as err:
            if 'Thrott' not in err.response['Error']['Code']:
                raise
            print(f'Got ThrottlingException. Sleeping {sleep_on_throttling_sec} sec and retrying.')
            time.sleep(sleep_on_throttling_sec)

In [7]:
# Smoke test: time one streaming request with a 50-token prompt and an output limit of 100 tokens.
# The displayed pair is (time to first byte, time to last byte) in seconds.
benchmark(bedrock, get_text_tokens(50), 100, stream=True)

(0.7483198642730713, 4.039894104003906)

In [8]:
import json
import pprint
pp = pprint.PrettyPrinter(indent=2)

# When True, the benchmark loop takes a single sample per scenario.
early_break = False

# Prompt sizes and output limits (in tokens) that span the test grid.
num_in_tokens_to_test = (50, 200, 1000, 2000, 4000, 8000, 16000, 32000, 64000, 100_000)
num_out_tokens_to_test = (50, 200, 1000, 2000, 4000, 8191)

# Every (input, output) combination; the output size is the outer (slow-moving) axis.
test_scenarios = [
    dict(
        in_tokens=in_tokens,
        out_tokens=out_tokens,
        name=f'in={in_tokens}, out={out_tokens}',
    )
    for out_tokens in num_out_tokens_to_test
    for in_tokens in num_in_tokens_to_test
]
test_scenarios

[{'in_tokens': 50, 'out_tokens': 50, 'name': 'in=50, out=50'},
 {'in_tokens': 200, 'out_tokens': 50, 'name': 'in=200, out=50'},
 {'in_tokens': 1000, 'out_tokens': 50, 'name': 'in=1000, out=50'},
 {'in_tokens': 2000, 'out_tokens': 50, 'name': 'in=2000, out=50'},
 {'in_tokens': 4000, 'out_tokens': 50, 'name': 'in=4000, out=50'},
 {'in_tokens': 8000, 'out_tokens': 50, 'name': 'in=8000, out=50'},
 {'in_tokens': 16000, 'out_tokens': 50, 'name': 'in=16000, out=50'},
 {'in_tokens': 32000, 'out_tokens': 50, 'name': 'in=32000, out=50'},
 {'in_tokens': 64000, 'out_tokens': 50, 'name': 'in=64000, out=50'},
 {'in_tokens': 100000, 'out_tokens': 50, 'name': 'in=100000, out=50'},
 {'in_tokens': 50, 'out_tokens': 200, 'name': 'in=50, out=200'},
 {'in_tokens': 200, 'out_tokens': 200, 'name': 'in=200, out=200'},
 {'in_tokens': 1000, 'out_tokens': 200, 'name': 'in=1000, out=200'},
 {'in_tokens': 2000, 'out_tokens': 200, 'name': 'in=2000, out=200'},
 {'in_tokens': 4000, 'out_tokens': 200, 'name': 'in=4000

In [None]:
# Number of samples per scenario; increase to sample each use case more often and discover jitter.
samples_per_scenario = 3

for scenario in test_scenarios:
    for _ in range(samples_per_scenario):
        try:
            # A fresh prompt of the required length is generated for every sample.
            prompt = get_text_tokens(scenario['in_tokens'])
            ttfb, ttlb = benchmark(bedrock, prompt, scenario['out_tokens'])

            duration = {
                'time-to-first-byte-seconds': ttfb,
                'time-to-last-byte-seconds': ttlb,
            }
            # Results accumulate on the scenario dict itself, one entry per successful sample.
            scenario.setdefault('durations', []).append(duration)

            print(f"Scenario: [{scenario['name']}], "
                  f"Duration: {pp.pformat(duration)}")
        except Exception as e:
            # Best effort: a failing sample is reported and the run moves on to the next one.
            print(f'{type(e).__name__}: {e}')
            print(f"Error while processing scenario: {scenario['name']}.")
        if early_break:
            # Only one sample per scenario when early_break is set.
            break

In [None]:
# Pretty-print every scenario together with the durations recorded on it by the benchmark loop.
pp.pprint(test_scenarios)
