#### Initialization
This part installs the modules the notebook needs; the imports themselves live in the cells that use them.

In [None]:
!python -m pip install tqdm confluent-kafka matplotlib


A bit of utility code to keep the credentials out of the GitHub repo

In [None]:
import functools

@functools.cache
def _load_ccloud_config(config_file):
    """Parse ``config_file`` once and return its ``(key, value)`` pairs as a tuple.

    The result is cached per path, so it is deliberately immutable: callers
    cannot alter what later callers receive.
    """
    pairs = []
    with open(config_file) as fh:
        for line in fh:
            line = line.strip()
            # Skip blank lines and '#' comments
            if len(line) != 0 and line[0] != "#":
                # Split on the first '=' only, so values may themselves contain '='
                parameter, value = line.split('=', 1)
                pairs.append((parameter.strip(), value.strip()))
    return tuple(pairs)


def read_ccloud_config(config_file='client.properties'):
    """Read a ``key=value`` properties file into a dict.

    Blank lines and lines starting with ``#`` are ignored; keys and values are
    stripped of surrounding whitespace. The file is parsed only once per path,
    but every call returns a fresh dict, so callers may modify the result
    without affecting each other.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a non-comment line contains no ``=``.
    """
    return dict(_load_ccloud_config(config_file))


# Keep the cache-management helpers the decorated function used to expose.
read_ccloud_config.cache_clear = _load_ccloud_config.cache_clear
read_ccloud_config.cache_info = _load_ccloud_config.cache_info
def read_ccloud_producer_config(config_file='client.properties'):
    """Return the settings from ``config_file`` that are handed to the Kafka clients.

    Drops the schema-registry / basic-auth fields and every key starting with
    ``confluent`` (e.g. the metrics endpoint and API token read elsewhere in
    this notebook).

    A new dict is built on every call. The dict returned by
    ``read_ccloud_config`` is cached and shared, so removing keys from it in
    place would change what later callers see.
    """
    omitted_fields = {'schema.registry.url', 'basic.auth.credentials.source', 'basic.auth.user.info'}
    omitted_prefix = 'confluent'
    return {
        fld: value
        for fld, value in read_ccloud_config(config_file).items()
        if fld not in omitted_fields and not fld.startswith(omitted_prefix)
    }

A basic Kafka Producer with a simple approach to data rate, payload size, and keys

In [None]:
from confluent_kafka import Producer
from datetime import datetime, timezone
from random import randint
from time import sleep
from tqdm import tqdm

def delivery_callback(err, msg):
    """Delivery report handler: print failures, stay silent on success.

    Args:
        err: falsy when the message was delivered, otherwise the error to report.
        msg: the message the report refers to (unused here).
    """
    if not err:
        # Successful deliveries are intentionally not logged.
        return
    print('ERROR: Message failed delivery: {}'.format(err))

def getMessages(numMessages, msgSize):
    """Lazily yield ``numMessages`` message dicts with ``'key'`` and ``'value'``.

    Args:
        numMessages: how many messages to generate.
        msgSize: requested payload size. The payload is the 64-character base
            sentence repeated ``msgSize // 64`` times, so sizes are rounded
            down to a multiple of 64 (and anything below 64 gives "").

    Yields:
        A new dict per message; the key is ``mt_key_1`` .. ``mt_key_6`` chosen
        at random, the value is the shared payload string.
    """
    # len is 64
    base_msg = "Upon our honor, we will monitor our data streaming application. "
    # Every message carries the same payload, so build it once instead of per message.
    payload = base_msg * (msgSize // len(base_msg))
    for _ in range(numMessages):
        yield {'key': f"mt_key_{randint(1,6)}", 'value': payload}

def publishMessages(load_params):
    """Produce a rate-limited burst of messages and report when it ran.

    Args:
        load_params: dict with
            num_producers       - how many Producer instances to create;
                                  messages are spread over them round-robin
            msg_rate_per_s      - target send rate
            msg_send_duration_s - total messages = rate * duration
            msg_size_bytes      - payload size passed to getMessages
            topic (optional)    - destination topic, default "sale_records"

    Returns:
        (startTime, endTime) as timezone-aware UTC datetimes, taken before the
        producers are created and after all of them have been flushed.
    """
    startTime = datetime.now(timezone.utc)

    # Simulating extra connections
    num_producers = load_params['num_producers']
    producers = [Producer(read_ccloud_producer_config()) for _ in range(num_producers)]

    topic = load_params.get('topic', 'sale_records')
    rate = load_params['msg_rate_per_s']
    numMessages = rate * load_params['msg_send_duration_s']
    msgSize = load_params['msg_size_bytes']

    msgSentCount = 0
    progressBar = tqdm(getMessages(numMessages, msgSize), total=numMessages, unit=' messages')
    for msg in progressBar:
        ts = datetime.now(timezone.utc)
        # Wrap the payload in a small JSON envelope that records the send time
        value = '{ "payload": "' + msg['value'] + '", "ts": "' + ts.isoformat() + '" }'
        producer = producers[msgSentCount % num_producers]
        producer.produce(topic, key=msg['key'], value=value, callback=delivery_callback)
        # Serve pending delivery callbacks without blocking, so failures are
        # reported while sending rather than only at flush() time.
        producer.poll(0)
        msgSentCount += 1

        # Rate Limiter: every 100 messages, sleep off any lead over the target rate
        if not msgSentCount % 100:
            elapsed_seconds = (ts - startTime).total_seconds()
            delay = msgSentCount / rate - elapsed_seconds
            # Guard on the delay itself: sleep() rejects negative values
            if delay > 0:
                sleep(delay)

    for producer in producers:
        producer.flush()

    endTime = datetime.now(timezone.utc)

    return startTime, endTime

The calls to the Confluent Metrics API to get the metrics we care about. A small bit of poetic license here: cluster_load is only available for Dedicated clusters on Confluent Cloud, so it is computed here instead, taking into account the various dependent resources.

In [None]:

import json
import urllib.request

# Catalogue of the metrics this notebook fetches and plots, keyed by the name
# used for the entry in the results dict.
#   title / yaxis_title  - labels used by plotGraphs
#   yaxis_limit          - optional upper bound for the y axis
#   scale_factor         - optional divisor getMetrics applies to every raw value
#   query                - metric-specific part of the Metrics API request body;
#                          getMetrics skips entries that have no 'query'
MetricsQueries = {
    'received_bytes': {
        'title': 'Ingress',
        'yaxis_title': 'MB',
        'yaxis_limit': 150,
        # bytes -> MB, matching yaxis_title
        'scale_factor': 1024*1024,
        'query': {
            'aggregations': [{ 'metric': 'io.confluent.kafka.server/received_bytes'}],
            'group_by': ['metric.topic']
        }
    },
    'sent_bytes': {
        'title': 'Egress',
        'yaxis_title': 'MB',
        'yaxis_limit': 150,
        # bytes -> MB, matching yaxis_title
        'scale_factor': 1024*1024,
        'query': {
            'aggregations': [{ 'metric': 'io.confluent.kafka.server/sent_bytes'}],
            'group_by': ['metric.topic']
        }
    },
    'active_connection_count' : {
        'title': 'Active Connection Count',
        'yaxis_title': 'Count',
        'yaxis_limit': 100,
        'query': {
            "aggregations":[{ "metric":"io.confluent.kafka.server/active_connection_count"}]
        }
    },
    'request_count': {
        'title': 'Request Count',
        'yaxis_title': 'Count',
        'query': {
            'aggregations': [{ 'metric': 'io.confluent.kafka.server/request_count'}]
        }
    },
    # No 'query': this series is not fetched, it is computed in getAllMetrics
    'cluster_load': {
        'title': 'Cluster Load',
        'yaxis_title': '% Load',
        'yaxis_limit': 100
    }
}
def getMetrics(startTime, endTime, cluster_id="lkc-v1jq15"):
    """Query the Confluent Metrics API for every entry in MetricsQueries that has a 'query'.

    Args:
        startTime, endTime: datetimes bounding the query interval (formatted
            with second precision).
        cluster_id: value matched against ``resource.kafka.id``.

    Returns:
        dict mapping metric name to the decoded JSON response. For metrics
        with a 'scale_factor', the response's 'data' list is replaced by
        ``{'timestamp', 'value'}`` dicts with the value divided by that factor.
        Metrics whose request did not return status 200 are left out.
    """
    conf = read_ccloud_config()
    url = conf['confluent.metrics.endpoint']
    headers = {
        'Authorization': f"Basic {conf['confluent.cloud_api_token']}",
        'Content-Type': 'application/json'
    }
    # Request fields shared by every metric; merged over the metric-specific query
    common = {
        "filter": {"op": "OR", "filters": [{"field": "resource.kafka.id", "op": "EQ", "value": cluster_id}]},
        "granularity": "PT1M",
        "limit": 1000,
        "intervals": [f"{startTime.isoformat(timespec='seconds')}/{endTime.isoformat(timespec='seconds')}"],
    }

    responses = {}
    for qry, spec in MetricsQueries.items():
        if 'query' not in spec:
            continue
        body = json.dumps(spec['query'] | common).encode('utf-8')

        req = urllib.request.Request(url, body, headers)
        # Context manager closes the HTTP response once it has been read.
        # Note: urlopen raises HTTPError for 4xx/5xx, so the non-200 branch
        # below only sees other successful status codes.
        with urllib.request.urlopen(req) as resp:
            status = resp.getcode()
            payload = resp.read() if status == 200 else None
        if payload is None:
            print(f"Error: {status}")
            continue

        responses[qry] = json.loads(payload)
        if 'scale_factor' in spec:
            scale_factor = spec['scale_factor']
            responses[qry]['data'] = [
                {'timestamp': point['timestamp'], 'value': point['value'] / scale_factor}
                for point in responses[qry]['data']
            ]

    return responses

def getAllMetrics(startTime, endTime):
    """Fetch the queried metrics and add a computed 'cluster_load' series.

    Waits until at least 60 seconds have passed since ``endTime`` before
    querying. For each data-point index, cluster load is the highest
    ``100 * value / limit`` across the queried metrics, with limits taken from
    ``confluent_max``; its timestamp is the latest timestamp seen at that index.

    Returns:
        The dict from getMetrics with an extra ``'cluster_load'`` entry of the
        form ``{'data': [{'timestamp', 'value'}, ...]}``.
    """
    # give the metrics a chance to get done
    # for the ever eager run all in notebook fans
    timeSinceProduce = (datetime.now(timezone.utc) - endTime).total_seconds()
    if timeSinceProduce < 60:
        sleep(60 - timeSinceProduce)

    results = getMetrics(startTime, endTime)
    queried = [name for name, spec in MetricsQueries.items() if 'query' in spec]
    cluster_load_data = []
    # pick any metric to get number of entries since we aggregate per minute
    # NOTE(review): series are lined up by list index, not by timestamp - confirm
    # this holds for the metrics that are grouped by topic.
    for i in range(len(results['active_connection_count']['data'])):
        cl_pct = 0
        cl_ts = ""
        for metric in queried:
            series = results[metric]['data']
            if i >= len(series):
                continue
            # getMetrics already divided the value by scale_factor (bytes -> MB),
            # while confluent_max is expressed in unscaled units: undo the
            # scaling so value and limit are compared in the same unit.
            raw_value = series[i]['value'] * MetricsQueries[metric].get('scale_factor', 1)
            cl_pct = max(cl_pct, 100.0 * raw_value / confluent_max[metric])
            cl_ts = max(cl_ts, series[i]['timestamp'])
        cluster_load_data.append({'timestamp': cl_ts, 'value': cl_pct})
    results['cluster_load'] = {'data': cluster_load_data}

    return results


Using matplotlib to plot our graphs in a semi-sane way

In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib import axes
%matplotlib inline

from math import ceil


def addPlot(axs, row, col, data, title, yAxisTitle, yAxisLimit, startTime):
    """Draw one metric series into the subplot at (row, col).

    Args:
        axs: the axes array returned by ``plt.subplots`` (1-D for a single
            row, 2-D otherwise).
        row, col: grid position of the target subplot.
        data: list of ``{'timestamp': ISO-8601 string, 'value': number}``.
        title, yAxisTitle: subplot title and y-axis label.
        yAxisLimit: upper y bound; falsy means only the lower bound (0) is set.
        startTime: datetime used as the origin of the x axis (seconds since).
    """
    def _parse_ts(text):
        # datetime.fromisoformat() only accepts a trailing "Z" from Python 3.11
        # on; rewrite it as an explicit UTC offset so older versions parse it too.
        if text.endswith('Z'):
            text = text[:-1] + '+00:00'
        return datetime.fromisoformat(text)

    x_values = [(_parse_ts(el['timestamp']) - startTime).total_seconds() for el in data]
    y_values = [el['value'] for el in data]

    # A single-row grid is a 1-D array of Axes; otherwise index by (row, col)
    if isinstance(axs[0], axes.Axes):
        axis = axs[col]
    else:
        axis = axs[row,col]

    axis.plot(x_values, y_values, marker='s')
    # Add labels and title
    axis.set_xlabel('Time (s)')

    axis.set_ylabel(yAxisTitle)
    if yAxisLimit:
        axis.set_ybound(lower=0, upper=yAxisLimit)
    else:
        axis.set_ybound(lower=0)

    axis.set_title(title)


def plotGraphs(results, graphs, startTime):
    """Plot the requested metrics in a two-column grid of subplots.

    Args:
        results: dict of metric name -> ``{'data': [...]}``.
        graphs: metric names to plot, in display order; each must be a key of
            both ``results`` and ``MetricsQueries``.
        startTime: datetime used as the x-axis origin for every subplot.
    """
    total_plots = len(graphs)
    if total_plots == 0:
        # Nothing to draw; a zero-row subplot grid cannot be created
        return

    ncols = 2
    # Derive the row count from ncols so the two cannot drift apart
    nrows = ceil(total_plots / ncols)
    fig, axs = plt.subplots(nrows, ncols, sharex=True, figsize=(8, 3*nrows))
    for index, qry in enumerate(graphs):
        spec = MetricsQueries[qry]
        addPlot(axs, index // ncols, index % ncols, results[qry]['data'],
                title=spec['title'],
                yAxisTitle=spec['yaxis_title'],
                yAxisLimit=spec.get('yaxis_limit'),
                startTime=startTime)

    fig.tight_layout()
    plt.show()

Definitions for the cluster limits, taken from https://docs.confluent.io/cloud/current/clusters/cluster-types.html#types-basic-limits-per-cluster on 12/1/2023. Things to note:
- The extra *60 is because the metrics are aggregated by the minute
- one of these has had a bit of artistic license applied to simulate an issue

In [None]:
# Per-metric ceilings; getAllMetrics divides each metric value by its entry
# here to express it as a percentage of the limit.
# The trailing "* 60" converts a per-second limit to the per-minute
# aggregation used by the metrics queries.
confluent_max = {
    'active_connection_count': 100,
    # NOTE(review): reads as 250 MiB/s and 750 MiB/s, expressed in bytes - confirm
    'received_bytes': 250 * 1024 * 1024 * 60,
    'sent_bytes': 750 * 1024 * 1024 * 60,
    'request_count': 15000 * 60
}


#### The Case Study

##### The original problem reproduced with a 4096 byte message size.

In [None]:
# Load Generation
# 500 msg/s for 180 s (90,000 messages) sent round-robin across 20 producers.
load_params = {
    'msg_size_bytes': 4096,
    'msg_rate_per_s': 500,
    'msg_send_duration_s': 180,
    'num_producers': 20 
}

# Produce the load, fetch the metrics for that time window, then plot.
# startTime and results stay in the notebook namespace for later cells.
startTime, endTime = publishMessages(load_params)
results = getAllMetrics(startTime, endTime)
plotGraphs(results, ['cluster_load', 'received_bytes'], startTime)

##### Now with half the payload

In [None]:
# Load Generation
# Same rate, duration and producer count as the 4096-byte run; only the payload size is halved.
load_params = {
    'msg_size_bytes': 2048,
    'msg_rate_per_s': 500,
    'msg_send_duration_s': 180,
    'num_producers': 20 
}

# Overwrites startTime/endTime/results from the previous run.
startTime, endTime = publishMessages(load_params)
results = getAllMetrics(startTime, endTime)
plotGraphs(results, ['cluster_load', 'received_bytes'], startTime)

##### The final analysis showing the real factor causing the high cluster load

In [None]:
# Re-plots the `results` of the most recent run (no new load is generated), adding the connection and request counts.
plotGraphs(results, ['cluster_load', 'received_bytes', 'active_connection_count', 'request_count'], startTime)

#### The Solution
What does reducing the number of producers do?

In [None]:
# Load Generation
# Original 4096-byte payload and rate, but with 10 producers instead of 20.
load_params = {
    'msg_size_bytes': 4096,
    'msg_rate_per_s': 500,
    'msg_send_duration_s': 180,
    'num_producers': 10 
}

# Produce the load, fetch the metrics for that time window, then plot all four graphs.
startTime, endTime = publishMessages(load_params)
results = getAllMetrics(startTime, endTime)
plotGraphs(results, ['cluster_load', 'received_bytes', 'active_connection_count', 'request_count'], startTime)

#### Helpers for the Puzzle
A few code snippets to help in the investigations.

How to use the AdminClient to query the cluster for the current list of brokers

In [None]:
from confluent_kafka.admin import AdminClient

# Built from the same filtered client settings the producers use.
# `adminClient` is reused by the partition-listing cell below.
adminClient = AdminClient(read_ccloud_producer_config()) 
# describe_cluster() returns a future; .result() blocks until the metadata is available
clusterMetadata = adminClient.describe_cluster().result()

# One line per node reported in the cluster metadata
for node in clusterMetadata.nodes:
    print(f"id: {node.id}, id_string: {node.id_string}, host: {node.host}, port: {node.port}, rack: {node.rack}")

How to find the current leader broker and ISRs for each partition in the topic

In [None]:
# list_topics() returns metadata for every topic visible to the client
topicsMetadata = adminClient.list_topics()

for topic, topicMetadata in topicsMetadata.topics.items():
    # Print the topic name so partitions of different topics can be told apart
    print(f"Topic {topic}")
    for partition in topicMetadata.partitions.values():
        print(f"  Partition id {partition.id}, leader: {partition.leader}, replicas: {partition.replicas}, isrs: {partition.isrs}")

How to get statistics out of our Producer every second. Add these into the producer code to generate stats every second. 

This code doesn't do anything by itself. It needs to be added to the producer code.

In [None]:
def stats_cb(s):
    """Statistics callback: print the transmit counters from a JSON stats blob.

    Args:
        s: JSON document (string) containing at least ``tx_bytes`` and ``txmsgs``.
    """
    stats = json.loads(s)
    reported_fields = ('tx_bytes', 'txmsgs')
    for field in reported_fields:
        print(f"{field}: {stats[field]} ")
        

# Build a separate dict instead of adding keys to the one returned by the
# helper: that dict comes from a cached reader, so modifying it in place would
# silently switch statistics on for every producer created afterwards.
conf = {
    **read_ccloud_producer_config(),
    'stats_cb': stats_cb,
    # Emit statistics once per second
    'statistics.interval.ms': 1000,
}