## Chunks of code we will need

Install the needed libraries

In [1]:
!python -m pip install confluent-kafka tabulate

Looking in indexes: https://aws:****@confluent-519856050701.d.codeartifact.us-west-2.amazonaws.com/pypi/pypi/simple/


A bit of utility code to keep the credentials out of the GitHub repo. There is an easy getting-started guide for using Confluent Cloud at https://developer.confluent.io/get-started/python/.

In [2]:
import functools

# Read the config file; cache it
@functools.cache
def read_ccloud_config(config_file='client.properties'):
    """Parse a ``key=value`` properties file into a dict.

    Blank lines and lines whose first non-blank character is ``#`` are
    skipped. Every other line is split on its first ``=`` only, so values may
    contain ``=`` themselves. Whitespace around both the key and the value is
    stripped, so ``key = value`` and ``key=value`` are equivalent.

    The result is memoised by ``functools.cache``: calls with the same
    arguments return the *same* dict object. Treat it as read-only and copy it
    before modifying.

    Args:
        config_file: Path of the properties file to read.

    Returns:
        dict mapping property names to their string values.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank, non-comment line contains no ``=``.
    """
    conf = {}
    with open(config_file) as fh:
        for line_no, raw_line in enumerate(fh, start=1):
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            parameter, separator, value = line.partition('=')
            if not separator:
                # Fail loudly, pointing at the offending line.
                raise ValueError(
                    f"{config_file}:{line_no}: expected 'key=value', got {line!r}")
            conf[parameter.strip()] = value.strip()
    return conf

# Skip non-Kafka client properties
def read_ccloud_producer_config(config_file='client.properties'):
    """Return the properties from ``config_file`` that are passed to the Kafka client.

    The schema-registry / basic-auth entries and every property whose name
    starts with ``confluent`` are left out.

    A new dict is built on every call. ``read_ccloud_config`` is memoised, so
    removing keys from (or adding keys to) the dict it returns would silently
    change what every later caller sees.

    Args:
        config_file: Path of the properties file to read.

    Returns:
        A fresh dict the caller is free to modify.
    """
    omitted_fields = {'schema.registry.url', 'basic.auth.credentials.source', 'basic.auth.user.info'}
    omitted_prefix = 'confluent'
    return {
        fld: value
        for fld, value in read_ccloud_config(config_file).items()
        if fld not in omitted_fields and not fld.startswith(omitted_prefix)
    }

A basic Kafka Producer with a simple approach to data rate, payload size, and keys
Some documentation:
- The python kafka client library is described at https://docs.confluent.io/kafka-clients/python/current/overview.html
- The metrics delivered via the statistics callback are documented in librdkafka's STATISTICS.md: https://github.com/confluentinc/librdkafka/blob/master/STATISTICS.md
- Configuration parameters for the producer are likewise documented in librdkafka's CONFIGURATION.md: https://github.com/confluentinc/librdkafka/blob/master/CONFIGURATION.md
- Callbacks are at the python client documentation https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#kafka-client-configuration

In [3]:
from confluent_kafka import Producer
from collections import defaultdict
from datetime import datetime, timezone
from random import randint
from time import sleep
import json


def get_stats_cb(results):
    """Build a statistics callback that records every report in ``results``.

    The callback receives one JSON document as a string, parses it, and
    appends the parsed report to ``results[<report 'name'>]``. Reports are
    accumulated, not overwritten, so callers can use the last entry or
    aggregate over the whole history.

    ``results`` must return a list for a name it has not seen before
    (for example ``defaultdict(list)``).
    """
    def stats_cb(s):
        report = json.loads(s)
        client_name = report['name']
        results[client_name].append(report)
    return stats_cb
        
def get_delivery_callback(latencies):
    """Build a delivery callback that collects per-message latencies.

    On success the callback appends ``msg.latency()`` to ``latencies``.
    On failure it prints the error and records nothing.
    """
    def delivery_callback(err, msg):
        if not err:
            latencies.append(msg.latency())
            return
        print('ERROR: Message failed delivery: {}'.format(err))
    return delivery_callback

def getMessages(numMessages, msgSize, num_partitions=6):
    """Yield ``numMessages`` synthetic messages as ``{'key': ..., 'value': ...}`` dicts.

    Args:
        numMessages: Number of messages to generate.
        msgSize: Target payload size in characters. The payload is a whole
            number of repetitions of a 64-character sentence, so the actual
            length is ``msgSize`` rounded down to a multiple of 64 (empty when
            ``msgSize`` is below 64).
        num_partitions: Number of distinct keys to cycle through. Defaults to
            6, the partition count the topic is configured with.

    Yields:
        A new dict per message, so callers may modify it freely. Keys are
        ``mt_key_0`` .. ``mt_key_<num_partitions - 1>`` in round-robin order.
    """
    # len is 64 for the below string
    base_msg = "Upon our honor, we will monitor our data streaming application. "
    # The payload is the same for every message, so build it once.
    payload = base_msg * (msgSize // len(base_msg))
    for i in range(numMessages):
        yield {'key': f"mt_key_{i % num_partitions}", 'value': payload}

def publishMessages(load_params):
    """Produce a synthetic load to the ``sale_records`` topic and collect client-side metrics.

    Args:
        load_params: dict with the keys
            ``num_producers``   - number of Producer instances to create,
            ``num_msgs``        - total number of messages to send,
            ``msg_size_bytes``  - payload size handed to ``getMessages``,
            ``msg_rate_per_s``  - target rate; the loop sleeps ``1 / rate``
                                  seconds after each message,
            ``extra_producer_args`` (optional) - extra client configuration
                                  merged over the file-based configuration.

    Returns:
        Tuple ``(startTime, produceEndTime, endTime, latencies, stats)``:
        the three UTC timestamps taken at function entry, after the last
        ``produce()`` call and after all producers were flushed; the list of
        latencies collected by the delivery callback; and the statistics
        reports collected by the statistics callback, keyed by report name.
    """
    startTime = datetime.now(timezone.utc)

    # Work on a private copy so that the callbacks and per-run overrides added
    # below cannot leak into a configuration object shared with other runs.
    conf = dict(read_ccloud_producer_config())
    statistics_interval_ms = 250
    stats = defaultdict(list)
    conf['stats_cb'] = get_stats_cb(stats)
    conf['statistics.interval.ms'] = statistics_interval_ms
    conf.update(load_params.get('extra_producer_args', {}))

    # Simulating extra connections: one Producer instance per requested producer
    num_producers = load_params['num_producers']
    producers = [Producer(conf) for _ in range(num_producers)]

    msgSentCount = 0
    numMessages = load_params['num_msgs']
    msgSize = load_params['msg_size_bytes']
    msgRateSleepTimeSecs = 1 / load_params['msg_rate_per_s']

    latencies = []
    delivery_callback = get_delivery_callback(latencies)

    for msg in getMessages(numMessages, msgSize):
        ts = datetime.now(timezone.utc)
        ts_str = ts.isoformat()
        msg['value'] = '{ "payload": "' + msg['value'] + '", "ts": "' + ts_str + '" }'
        # Spread the messages round-robin over the producers
        producer_index = msgSentCount % num_producers
        producers[producer_index].produce("sale_records", key=msg['key'], value=msg['value'],
                                          callback=delivery_callback)
        if not msgSentCount % 100:
            for producer in producers:
                # Serve pending delivery/statistics callbacks without blocking;
                # poll() without a timeout waits until an event arrives, which
                # would stall the send loop and distort the message rate.
                producer.poll(0)
        msgSentCount += 1
        sleep(msgRateSleepTimeSecs)

    produceEndTime = datetime.now(timezone.utc)

    # Wait for all outstanding messages; remaining callbacks fire in here
    for producer in producers:
        producer.flush()

    endTime = datetime.now(timezone.utc)

    return startTime, produceEndTime, endTime, latencies, stats

The calls to the Confluent Metrics API to get the metrics we care about. 
- Documentation for it is at https://docs.confluent.io/cloud/current/monitoring/metrics-api.html. 
- A complete list of available cluster metrics is at https://api.telemetry.confluent.cloud/docs/descriptors/datasets/cloud

In [4]:
from datetime import datetime, timedelta
import json
import urllib.request

# Metrics API request fragments, keyed by a short metric name.
# getMetrics() merges each 'query' with the shared filter, granularity,
# limit and time interval before posting it.
MetricsQueries = {
    "received_bytes": {
        "query": {
            "aggregations": [
                {"metric": "io.confluent.kafka.server/received_bytes"},
            ],
            "group_by": ["metric.topic"],
        },
    },
    "active_connection_count": {
        "query": {
            "aggregations": [
                {"metric": "io.confluent.kafka.server/active_connection_count"},
            ],
        },
    },
    "request_count": {
        "query": {
            "aggregations": [
                {"metric": "io.confluent.kafka.server/request_count"},
            ],
        },
    },
    "received_records": {
        "query": {
            "aggregations": [
                {"metric": "io.confluent.kafka.server/received_records"},
            ],
        },
    },
}
def getMetrics(startTime, endTime, cluster_id="lkc-v1jq15"):
    """Query the Confluent Cloud Metrics API for every entry in ``MetricsQueries``.

    The function first sleeps 60 seconds and widens the requested interval by
    60 seconds on both sides, because the API aggregates by the minute and the
    clocks may be skewed.

    Args:
        startTime: datetime marking the start of the load.
        endTime: datetime marking the end of the load.
        cluster_id: Kafka cluster id used in the ``resource.kafka.id`` filter.
            Defaults to the cluster this notebook was written against.

    Returns:
        dict mapping each ``MetricsQueries`` key to the decoded JSON response.
        A query answered with a non-200 success status is reported on stdout
        and left out of the result.

    Raises:
        urllib.error.HTTPError: If the API answers with an HTTP error status.
            The status and the request body are printed before re-raising.
        KeyError: If the endpoint or API token is missing from the config file.
    """
    # Imported locally so that adding error handling does not depend on the
    # notebook's import cell.
    from urllib.error import HTTPError

    # The Metrics API aggregates by the minute, and throw in clock skew
    sleep(60)
    startTime -= timedelta(seconds=60)
    endTime += timedelta(seconds=60)

    conf = read_ccloud_config()
    url = conf['confluent.metrics.endpoint']
    headers = {
        'Authorization': f"Basic {conf['confluent.cloud_api_token']}",
        'Content-Type': 'application/json'
    }
    common = {
        "filter":{"op":"OR","filters":[{"field":"resource.kafka.id","op":"EQ","value":cluster_id}]},
        "granularity":"PT1M",
        "limit":1000
    }
    interval = {
        "intervals":[f"{startTime.isoformat(timespec='seconds')}/{endTime.isoformat(timespec='seconds')}"],
    }

    responses = {}
    for qry, spec in MetricsQueries.items():
        data = spec['query'] | common | interval
        body = json.dumps(data)

        req = urllib.request.Request(url, body.encode('utf-8'), headers)
        try:
            # The context manager closes the HTTP response.
            with urllib.request.urlopen(req) as resp:
                status = resp.getcode()
                if status == 200:
                    responses[qry] = json.loads(resp.read())
                else:
                    print(f"Error: {status}, Request was {body}")
        except HTTPError as err:
            # urlopen raises for HTTP error statuses instead of returning a
            # response; show the failing request, then let the error propagate.
            print(f"Error: {err.code}, Request was {body}")
            raise

    return responses


### Simulations

The main code to execute a sample load. Given parameters of the number of producers, the message size, and the number of messages, it sends out all the messages, waits for acknowledgements, collects client side metrics, and calls the metrics api to get back cluster metrics. It also includes a simple tabular print utility.

In [5]:
from statistics import mean
def execute_simulation_app(load_params):
    """Run one load via ``publishMessages`` and summarise it as one table row.

    Args:
        load_params: Load description passed straight to ``publishMessages``.

    Returns:
        list with, in order: number of producers, number of messages, message
        size, the ``linger.ms`` override (``None`` when not set), produce time
        and total time in whole seconds, throughput in KB/s, and the mean of
        the latencies reported by the delivery callback (``nan`` when no
        latency was recorded).
    """
    start, endProduce, end, latencies, stats = publishMessages(load_params)

    # total_seconds() keeps the fractional part, so a sub-second run does not
    # divide by zero, and it also covers spans longer than a day.
    produce_secs = (endProduce - start).total_seconds()
    total_secs = (end - start).total_seconds()
    print(f"Done in {int(total_secs)} s Start:{start}, End:{end})")

    payload_kb = load_params['num_msgs'] * load_params['msg_size_bytes'] / 1024
    # latency() may be None when the client has no latency for a message
    known_latencies = [lat for lat in latencies if lat is not None]

    result = [
        load_params['num_producers'],
        load_params['num_msgs'],
        load_params['msg_size_bytes'],
        load_params.get('extra_producer_args', {}).get('linger.ms'),
        int(produce_secs),
        int(total_secs),
        int(payload_kb / total_secs) if total_secs > 0 else 0,
        mean(known_latencies) if known_latencies else float('nan')
    ]
    return result

from tabulate import tabulate
def print_results_app(results):
    """Print rows produced by ``execute_simulation_app`` as an ``orgtbl`` table.

    Args:
        results: Iterable of rows, each with the eight values listed in
            ``headers`` below, in that order.
    """
    headers = [
        'Producers', 'Messages', 'Message Size (bytes)', 'linger.ms',
        'Produce Time (s)', 'Total Time (s)', 'Throughput (KB/s)',
        # Message.latency() reports seconds, not microseconds
        'Avg Latency (s)',
    ]
    print(tabulate(results, headers=headers, tablefmt='orgtbl'))

#### Linger.ms

In [6]:
# Load generation: base parameters shared by every run of the sweep
load_params = {
    'num_producers': 1,
    'num_msgs': 1000,
    'msg_rate_per_s': 100, 
    'msg_size_bytes': 4*1024,
    'extra_producer_args': {
        'linger.ms': 0
    }
}

# linger.ms values to try, one run per value
linger_ms_times = [ 0, 10, 100, 1000, 10000 ]

# One result row per linger.ms value; load_params is updated in place, so
# after the loop it holds the last value tried
results = []
for linger_ms in linger_ms_times:
    load_params['extra_producer_args']['linger.ms'] = linger_ms
    results.append(execute_simulation_app(load_params))

print_results_app(results)

Done in 120 s Start:2024-02-02 23:25:38.193866+00:00, End:2024-02-02 23:27:38.688618+00:00)
Done in 118 s Start:2024-02-02 23:27:38.696706+00:00, End:2024-02-02 23:29:37.587076+00:00)


%6|1706916649.769|FAIL|rdkafka#producer-3| [thrd:sasl_ssl://b7-pkc-lzvrd.us-west4.gcp.confluent.cloud:9092/7]: sasl_ssl://b7-pkc-lzvrd.us-west4.gcp.confluent.cloud:9092/7: Disconnected (after 70907ms in state UP)


Done in 121 s Start:2024-02-02 23:29:37.593751+00:00, End:2024-02-02 23:31:39.380020+00:00)
Done in 120 s Start:2024-02-02 23:31:39.387398+00:00, End:2024-02-02 23:33:40.355569+00:00)
Done in 122 s Start:2024-02-02 23:33:40.359552+00:00, End:2024-02-02 23:35:43.110320+00:00)
|   Producers |   Messages |   Message Size (bytes) |   linger.ms |   Produce Time (S) |   Total Time (s) |   Throughput (KB/s) |   Avg Latency (micros) |
|-------------+------------+------------------------+-------------+--------------------+------------------+---------------------+------------------------|
|           1 |      10000 |                   4096 |           0 |                120 |              120 |                 333 |               0.716422 |
|           1 |      10000 |                   4096 |          10 |                118 |              118 |                 338 |               0.718877 |
|           1 |      10000 |                   4096 |         100 |                121 |              12

The same code but now using python client library metrics

In [23]:
from statistics import mean
def execute_simulation_app_client(load_params):
    """Run one load and summarise it, including Kafka client library statistics.

    Args:
        load_params: Load description passed straight to ``publishMessages``.

    Returns:
        list with, in order: number of producers, number of messages, message
        size, the ``linger.ms`` override (``None`` when not set), produce time
        and total time in whole seconds, throughput in KB/s, mean delivery
        latency, then the client statistics: requests made, requests per
        second, messages sent, batch count and average batch size. Averages
        are ``nan`` when there is nothing to average.
    """
    start, endProduce, end, latencies, stats = publishMessages(load_params)

    # total_seconds() keeps the fractional part, so a sub-second run does not
    # divide by zero, and it also covers spans longer than a day.
    produce_secs = (endProduce - start).total_seconds()
    total_secs = (end - start).total_seconds()
    print(f"Done in {int(total_secs)} s Start:{start}, End:{end})")

    # Kafka client library metrics
    # Some are aggregated so we only need the last value
    # Some are per metric report, so we need to aggregate them ourselves
    topic_stats = [t for s in stats.values() for entry in s for t in entry['topics'].values()]
    batch_size_avgs = [t['batchsize']['avg'] for t in topic_stats]
    client_metrics = {
        'num_requests_made': sum(s[-1]['tx'] for s in stats.values()),
        'num_messages_sent': sum(s[-1]['txmsgs'] for s in stats.values()),
        'num_batch_cnt': sum(t['batchcnt']['cnt'] for t in topic_stats),
        # No statistics report may have arrived on a very short run
        'avg_batch_size_bytes': mean(batch_size_avgs) if batch_size_avgs else float('nan')
    }

    payload_kb = load_params['num_msgs'] * load_params['msg_size_bytes'] / 1024
    # latency() may be None when the client has no latency for a message
    known_latencies = [lat for lat in latencies if lat is not None]

    result = [
        load_params['num_producers'],
        load_params['num_msgs'],
        load_params['msg_size_bytes'],
        load_params.get('extra_producer_args', {}).get('linger.ms'),
        int(produce_secs),
        int(total_secs),
        int(payload_kb / total_secs) if total_secs > 0 else 0,
        mean(known_latencies) if known_latencies else float('nan'),

        client_metrics['num_requests_made'],
        client_metrics['num_requests_made'] / total_secs if total_secs > 0 else 0,
        client_metrics['num_messages_sent'],
        client_metrics['num_batch_cnt'],
        client_metrics['avg_batch_size_bytes']
    ]
    return result

from tabulate import tabulate
def print_results_app_client(results):
    """Print rows produced by ``execute_simulation_app_client`` as an ``orgtbl`` table.

    Args:
        results: Iterable of rows, each with the thirteen values listed in
            ``headers`` below, in that order.
    """
    headers = [
        'Producers', 'Messages', 'Message Size (bytes)', 'linger.ms',
        'Produce Time (s)', 'Total Time (s)', 'Throughput (KB/s)',
        # Message.latency() reports seconds, not microseconds
        'Avg Latency (s)',
        # Kafka client library statistics
        'Requests', 'Request Rate', 'Messages Sent', 'Batches Sent', 'Avg Batch Size',
    ]
    print(tabulate(results, headers=headers, tablefmt='orgtbl'))

In [24]:
# Load generation: base parameters shared by every run of the sweep
load_params = {
    'num_producers': 1,
    'num_msgs': 1000,
    'msg_rate_per_s': 100, 
    'msg_size_bytes': 4*1024,
    'extra_producer_args': {
        'linger.ms': 0
    }
}

# linger.ms values to try, one run per value
linger_ms_times = [ 0, 10, 100, 1000, 10000 ]

# One result row per linger.ms value, this time with client library metrics;
# load_params is updated in place
results = []
for linger_ms in linger_ms_times:
    load_params['extra_producer_args']['linger.ms'] = linger_ms
    results.append(execute_simulation_app_client(load_params))

print_results_app_client(results)

Done in 12 s Start:2024-02-02 23:48:58.542527+00:00, End:2024-02-02 23:49:11.000761+00:00)
Done in 12 s Start:2024-02-02 23:49:11.005187+00:00, End:2024-02-02 23:49:23.222951+00:00)
Done in 12 s Start:2024-02-02 23:49:23.226505+00:00, End:2024-02-02 23:49:35.653324+00:00)
Done in 12 s Start:2024-02-02 23:49:35.659026+00:00, End:2024-02-02 23:49:48.224725+00:00)
Done in 13 s Start:2024-02-02 23:49:48.229457+00:00, End:2024-02-02 23:50:01.524892+00:00)
|   Producers |   Messages |   Message Size (bytes) |   linger.ms |   Produce Time (S) |   Total Time (s) |   Throughput (KB/s) |   Avg Latency (micros) |   Requests |   Request Rate |   Messages Sent |   Batches Sent |   Avg Batch Size |
|-------------+------------+------------------------+-------------+--------------------+------------------+---------------------+------------------------+------------+----------------+-----------------+----------------+------------------|
|           1 |       1000 |                   4096 |           0 |

Finally, the same code but now including a call to the Confluent Cloud Metrics API for cluster metrics.

In [25]:
from statistics import mean
def execute_simulation_app_client_cluster(load_params):
    """Run one load and summarise it with client statistics and cluster metrics.

    Args:
        load_params: Load description passed straight to ``publishMessages``.

    Returns:
        list with, in order: number of producers, number of messages, message
        size, the ``linger.ms`` override (``None`` when not set), produce time
        and total time in whole seconds, throughput in KB/s, mean delivery
        latency; the client statistics (requests made, requests per second,
        messages sent, batch count, average batch size); and the cluster
        metrics from ``getMetrics`` (received bytes divided by 1024, peak
        active connection count, request count, received records). Averages
        are ``nan`` when there is nothing to average.
    """
    start, endProduce, end, latencies, stats = publishMessages(load_params)

    # total_seconds() keeps the fractional part, so a sub-second run does not
    # divide by zero, and it also covers spans longer than a day.
    produce_secs = (endProduce - start).total_seconds()
    total_secs = (end - start).total_seconds()
    print(f"Done in {int(total_secs)} s Start:{start}, End:{end})")

    # Kafka client library metrics
    # Some are aggregated so we only need the last value
    # Some are per metric report, so we need to aggregate them ourselves
    topic_stats = [t for s in stats.values() for entry in s for t in entry['topics'].values()]
    batch_size_avgs = [t['batchsize']['avg'] for t in topic_stats]
    client_metrics = {
        'num_requests_made': sum(s[-1]['tx'] for s in stats.values()),
        'num_messages_sent': sum(s[-1]['txmsgs'] for s in stats.values()),
        'num_batch_cnt': sum(t['batchcnt']['cnt'] for t in topic_stats),
        # No statistics report may have arrived on a very short run
        'avg_batch_size_bytes': mean(batch_size_avgs) if batch_size_avgs else float('nan')
    }

    # Metrics from the cluster metrics API
    from_metrics_api = getMetrics(start, end)
    cluster_metrics = {
        'received_bytes': sum(v['value'] for v in from_metrics_api['received_bytes']['data']),
        # default=0 covers an interval for which the API returned no data points
        'active_connection_count': max((v['value'] for v in from_metrics_api['active_connection_count']['data']), default=0),
        'request_count': sum(v['value'] for v in from_metrics_api['request_count']['data']),
        'received_records': sum(v['value'] for v in from_metrics_api['received_records']['data'])
    }

    payload_kb = load_params['num_msgs'] * load_params['msg_size_bytes'] / 1024
    # latency() may be None when the client has no latency for a message
    known_latencies = [lat for lat in latencies if lat is not None]

    result = [
        load_params['num_producers'],
        load_params['num_msgs'],
        load_params['msg_size_bytes'],
        load_params.get('extra_producer_args', {}).get('linger.ms'),
        int(produce_secs),
        int(total_secs),
        int(payload_kb / total_secs) if total_secs > 0 else 0,
        mean(known_latencies) if known_latencies else float('nan'),

        client_metrics['num_requests_made'],
        client_metrics['num_requests_made'] / total_secs if total_secs > 0 else 0,
        client_metrics['num_messages_sent'],
        client_metrics['num_batch_cnt'],
        client_metrics['avg_batch_size_bytes'],

        # bytes / 1024, i.e. kilobytes
        cluster_metrics['received_bytes'] / (1024),
        cluster_metrics['active_connection_count'],
        cluster_metrics['request_count'],
        cluster_metrics['received_records']
    ]
    return result

from tabulate import tabulate
def print_results_app_client_cluster(results):
    """Print rows produced by ``execute_simulation_app_client_cluster`` as an ``orgtbl`` table.

    Args:
        results: Iterable of rows, each with the seventeen values listed in
            ``headers`` below, in that order.
    """
    headers = [
        'Producers', 'Messages', 'Message Size (bytes)', 'linger.ms',
        'Produce Time (s)', 'Total Time (s)', 'Throughput (KB/s)',
        # Message.latency() reports seconds, not microseconds
        'Avg Latency (s)',
        # Kafka client library statistics
        'Requests', 'Request Rate', 'Messages Sent', 'Batches Sent', 'Avg Batch Size',
        # Cluster metrics. The ingress column holds received_bytes / 1024,
        # i.e. kilobytes; the 'Cluster' prefix keeps these columns apart from
        # the client-side 'Requests' / 'Messages Sent' columns.
        'Ingress (KB)', 'Connections', 'Cluster Requests', 'Cluster Messages',
    ]
    print(tabulate(results, headers=headers, tablefmt='orgtbl'))

In [26]:
# Load generation: base parameters shared by every run of the sweep
load_params = {
    'num_producers': 1,
    'num_msgs': 10000,
    'msg_rate_per_s': 100, 
    'msg_size_bytes': 4*1024,
    'extra_producer_args': {
        'linger.ms': 0
    }
}

# linger.ms values to try, one run per value
linger_ms_times = [ 0, 10, 100, 1000, 10000 ]

# One result row per linger.ms value, with client and cluster metrics;
# load_params is updated in place
results = []
for linger_ms in linger_ms_times:
    load_params['extra_producer_args']['linger.ms'] = linger_ms
    results.append(execute_simulation_app_client_cluster(load_params))
    sleep(120) # Cluster metrics are aggregated to the minute and we want to avoid overlap

print_results_app_client_cluster(results)

Done in 121 s Start:2024-02-02 23:54:49.324021+00:00, End:2024-02-02 23:56:50.436093+00:00)
Done in 119 s Start:2024-02-02 23:59:53.066745+00:00, End:2024-02-03 00:01:52.793785+00:00)
Done in 120 s Start:2024-02-03 00:04:55.320939+00:00, End:2024-02-03 00:06:56.046369+00:00)
Done in 121 s Start:2024-02-03 00:09:58.626834+00:00, End:2024-02-03 00:12:00.281963+00:00)
Done in 123 s Start:2024-02-03 00:15:03.170053+00:00, End:2024-02-03 00:17:06.987051+00:00)
|   Producers |   Messages |   Message Size (bytes) |   linger.ms |   Produce Time (S) |   Total Time (s) |   Throughput (KB/s) |   Avg Latency (micros) |   Requests |   Request Rate |   Messages Sent |   Batches Sent |   Avg Batch Size |   ingress (MB) |   connections |   Requests |   Messages |
|-------------+------------+------------------------+-------------+--------------------+------------------+---------------------+------------------------+------------+----------------+-----------------+----------------+------------------+----

#### Requests

Still bothered that the client-side request count != the cluster's request_count. Let's turn debugging on for a small test.

In [28]:
import logging

# Use a dedicated logger for the client's debug output. Switching the root
# logger to DEBUG would change logging for every library for the rest of the
# kernel session.
logger = logging.getLogger('publisher')
logger.setLevel(logging.DEBUG)
# Keep the debug output in the file only
logger.propagate = False
# Re-running this cell must not add a second FileHandler, which would write
# every line to publisher.log twice
if not any(isinstance(h, logging.FileHandler) for h in logger.handlers):
    logger.addHandler(logging.FileHandler(filename='publisher.log'))

# Load generation: a single small run with client debugging switched on
load_params = {
    'num_producers': 1,
    'num_msgs': 1000,
    'msg_rate_per_s': 100,
    'msg_size_bytes': 4*1024,
    'extra_producer_args': {
        'linger.ms': 0,
        'debug': 'all',
        'logger': logger
    }
}

results = []
results.append(execute_simulation_app_client_cluster(load_params))

print_results_app_client_cluster(results)

Done in 12 s Start:2024-02-03 00:43:01.814426+00:00, End:2024-02-03 00:43:13.986518+00:00)
|   Producers |   Messages |   Message Size (bytes) |   linger.ms |   Produce Time (S) |   Total Time (s) |   Throughput (KB/s) |   Avg Latency (micros) |   Requests |   Request Rate |   Messages Sent |   Batches Sent |   Avg Batch Size |   ingress (MB) |   connections |   Requests |   Messages |
|-------------+------------+------------------------+-------------+--------------------+------------------+---------------------+------------------------+------------+----------------+-----------------+----------------+------------------+----------------+---------------+------------+------------|
|           1 |       1000 |                   4096 |           0 |                 12 |               12 |                 333 |               0.823122 |        902 |        75.1667 |             990 |            862 |          7215.74 |        4126.28 |             1 |        874 |       1000 |


#### Connections

Trying out the number of connections by changing the number of producers

In [30]:
# Load generation: base parameters shared by every run of the sweep
load_params = {
    'num_producers': 1,
    'num_msgs': 20000,
    'msg_rate_per_s': 100000, 
    'msg_size_bytes': 2*1024,
    'extra_producer_args': {
        'linger.ms': 0
    }
}

# Producer counts to try, one run per value
num_producers = [ 1, 2, 10, 20 ]

# One result row per producer count; load_params is updated in place
results = []
for producers in num_producers:
    load_params['num_producers'] = producers
    results.append(execute_simulation_app_client_cluster(load_params))
    sleep(120) # Wait to make sure there is no overlap in cluster metrics


print_results_app_client_cluster(results)

Done in 26 s Start:2024-02-03 01:23:20.433946+00:00, End:2024-02-03 01:23:47.012338+00:00)
Done in 26 s Start:2024-02-03 01:26:49.505427+00:00, End:2024-02-03 01:27:15.822420+00:00)
Done in 11 s Start:2024-02-03 01:30:18.404505+00:00, End:2024-02-03 01:30:29.719345+00:00)
Done in 14 s Start:2024-02-03 01:33:32.825905+00:00, End:2024-02-03 01:33:47.565507+00:00)
|   Producers |   Messages |   Message Size (bytes) |   linger.ms |   Produce Time (S) |   Total Time (s) |   Throughput (KB/s) |   Avg Latency (micros) |   Requests |   Request Rate |   Messages Sent |   Batches Sent |   Avg Batch Size |   ingress (MB) |   connections |   Requests |   Messages |
|-------------+------------+------------------------+-------------+--------------------+------------------+---------------------+------------------------+------------+----------------+-----------------+----------------+------------------+----------------+---------------+------------+------------|
|           1 |      20000 |            