From 9b13071fbefef8d1a1d723de93538e5d8818519c Mon Sep 17 00:00:00 2001
From: hvanz
Date: Thu, 21 Mar 2024 11:52:08 +0100
Subject: [PATCH 1/9] Improve prometheus_plotter.py

---
 docs/references/qa/method.md               |   8 +-
 scripts/qa/reporting/README.md             |  27 +-
 scripts/qa/reporting/prometheus_plotter.py | 278 ++++++++++-----------
 3 files changed, 153 insertions(+), 160 deletions(-)

diff --git a/docs/references/qa/method.md b/docs/references/qa/method.md
index 4127473fe5..e55ba5d947 100644
--- a/docs/references/qa/method.md
+++ b/docs/references/qa/method.md
@@ -159,12 +159,14 @@ The CometBFT team should improve it at every iteration to increase the amount of
 [`latency_throughput.py`]: ../../../scripts/qa/reporting/README.md#Latency-vs-Throughput-Plotting
 [`latency_plotter.py`]: ../../../scripts/qa/reporting/README.md#Latency-vs-Throughput-Plotting-version-2
 
-#### Extracting Prometheus Metrics
+#### Extract Prometheus Metrics
 
 1. Stop the prometheus server if it is running as a service (e.g. a `systemd` unit).
-2. Unzip the prometheus database retrieved from the testnet, and move it to replace the
-   local prometheus database.
+2. Unzip the prometheus database retrieved from the testnet.
 3. Start the prometheus server and make sure no error logs appear at start up.
+   ```bash
+   prometheus --storage.tsdb.path=path/to/prometheus/data --config.file=path/to/prometheus.yml
+   ```
 4. Identify the time window you want to plot in your graphs.
 5. Execute the [`prometheus_plotter.py`] script for the time window.
 
diff --git a/scripts/qa/reporting/README.md b/scripts/qa/reporting/README.md
index d8598e2214..1bd0067a6b 100644
--- a/scripts/qa/reporting/README.md
+++ b/scripts/qa/reporting/README.md
@@ -75,23 +75,16 @@ python3 latency_plotter.py /path/to/csv/files/raw.csv
 
 ## Prometheus metrics
 
-1. Ensure that Prometheus is running locally and listening on port 9090.
-2. Tweak the script to your needs
-   1. Adjust the time window
-   2. Select the right fork
-   3. Select the right test case
-   4. Tweak/add/remove metrics
-3. Run the script as follows
-   ```bash
-   # Do the following while ensuring that the virtual environment is activated (see
-   # the Setup steps).
-   #
-   # This will generate a series of plots in the folder `imgs` of the current folder.
-
-   mkdir imgs
-   python3 prometheus_plotter.py
-   ```
-4. Plots are saved in the `imgs` folder.
+The `prometheus_plotter.py` script generates a series of plots in the `imgs` folder under the current directory.
+
+Before running the script, check that a Prometheus server is running and listening on `localhost:9090`. This URL is hardcoded in the script's `PROMETHEUS_URL` constant.
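+A quick way to verify that the server is up (assuming a standard Prometheus installation, which exposes a health endpoint) is:
+```bash
+curl http://localhost:9090/-/healthy
+```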
+ +Run the script from the virtual environment as follows: +```bash +python3 prometheus_plotter.py +``` + +For details and examples of how to run the script, just run `python3 prometheus_plotter.py` [matplotlib]: https://matplotlib.org/ [pandas]: https://pandas.pydata.org diff --git a/scripts/qa/reporting/prometheus_plotter.py b/scripts/qa/reporting/prometheus_plotter.py index fbc62050f8..1c5e6bcc4d 100644 --- a/scripts/qa/reporting/prometheus_plotter.py +++ b/scripts/qa/reporting/prometheus_plotter.py @@ -1,151 +1,149 @@ # pip install numpy pandas matplotlib requests - -import sys import os +import requests +import sys import matplotlib as mpl import matplotlib.pyplot as plt import matplotlib.dates as md - import numpy as np import pandas as pd -import requests from urllib.parse import urljoin - -from prometheus_pandas import query - -#release = 'v0.37.0-alpha.2' -release = 'v0.38.0-alpha.2' -path = os.path.join('imgs') -prometheus = query.Prometheus('http://localhost:9090') - -# Time window -#window_size = dict(seconds=150) #CMT 0.37.x-alpha3 -#window_size = dict(seconds=126) #TM v0.37 (200 nodes) baseline -#window_size = dict(hours=1, minutes=28, seconds=25) #TM v0.37.0-alpha.2 (rotating) -#window_size = dict(seconds=130) #homogeneous -#window_size = dict(seconds=127) #baseline -#window_size = dict(seconds=115) #CMT v0.38.0-alpha.2 (200 nodes) -#window_size = dict(hours=1, minutes=46) #CMT v0.38.0-alpha.2 (rotating) -window_size = dict(seconds=150) #CMT v0.38.0-alpha.2 (ve baseline) - -ext_window_size = dict(seconds=200) - -# Use the time provided by latency_plotter for the selected experiment. -#left_end = '2023-02-08T13:12:20Z' #cmt2 tm1 -#left_end = '2023-02-08T10:31:50Z' #cmt1 tm2 -#left_end = '2023-02-14T15:18:00Z' #cmt1 tm1 -#left_end = '2023-02-07T18:07:00Z' #homogeneous -#left_end = '2022-10-13T19:41:23Z' #baseline -#left_end = '2023-02-22T18:56:29Z' #CMT v0.37.x-alpha3 -#left_end = '2022-10-13T15:57:50Z' #TM v0.37 (200 nodes) baseline -#left_end = '2023-03-20T19:45:35Z' #feature/abci++vef merged with main (7d8c9d426) -#left_end = '2023-05-22T09:39:20Z' #CMT v0.38.0-alpha.2 - 200 nodes -#left_end = '2022-10-10T15:47:15Z' #TM v0.37.0-alpha.2 - rotating -#left_end = '2023-05-23T08:09:50Z' #CMT v0.38.0-alpha.2 - rotating - -#left_end = '2023-05-25T18:18:04Z' #CMT v0.38.0-alpha.2 - ve baseline -#left_end = '2023-05-30T19:05:32Z' #CMT v0.38.0-alpha.2 - ve 2k -left_end = '2023-05-30T20:44:46Z' #CMT v0.38.0-alpha.2 - ve 4k -#left_end = '2023-05-25T19:42:08Z' #CMT v0.38.0-alpha.2 - ve 8k -#left_end = '2023-05-26T00:28:12Z' #CMT v0.38.0-alpha.2 - ve 16k -#left_end = '2023-05-26T02:12:27Z' #CMT v0.38.0-alpha.2 - ve 32k - -useManualrightEnd = False -if useManualrightEnd: - #right_end = '2023-05-25T18:54:04Z' #CMT v0.38.0-alpha.2 - ve baseline - #right_end = '2023-05-30T19:40:41Z' #CMT v0.38.0-alpha.2 - ve 2k - right_end = '2023-05-30T21:15:37Z' #CMT v0.38.0-alpha.2 - ve 4k - #right_end = '2023-05-25T20:16:00Z' #CMT v0.38.0-alpha.2 - ve 8k - #right_end = '2023-05-26T01:01:57Z' #CMT v0.38.0-alpha.2 - ve 16k - #right_end = '2023-05-26T02:46:19Z' #CMT v0.38.0-alpha.2 - ve 32k - time_window = (left_end, right_end) -else: - right_end = pd.to_datetime(left_end) + pd.Timedelta(**window_size) - time_window = (left_end, right_end.strftime('%Y-%m-%dT%H:%M:%SZ')) - -ext_right_end = pd.to_datetime(left_end) + pd.Timedelta(**ext_window_size) -ext_time_window = (left_end, ext_right_end.strftime('%Y-%m-%dT%H:%M:%SZ')) - - -fork='cometbft' -#fork='tendermint' - -# Do prometheus queries, depending on the 
test case -queries200Nodes = [ - (( fork + '_mempool_size', time_window[0], time_window[1], '1s'), 'mempool_size', dict(ylabel='TXs', xlabel='time (s)', title='Mempool Size', legend=False, figsize=(10,6), grid=True, kind='area',stacked=True), False), - (( fork + '_p2p_peers', time_window[0], time_window[1], '1s'), 'peers', dict(ylabel='# Peers', xlabel='time (s)', title='Peers', legend=False, figsize=(10,6), grid=True), True), - (( 'avg(' + fork + '_mempool_size)', time_window[0], time_window[1], '1s'), 'avg_mempool_size', dict(ylabel='TXs', xlabel='time (s)', title='Average Mempool Size', legend=False, figsize=(10,6), grid=True), False), - #(( 'cometbft_consensus_height', time_window[0], time_window[1], '1s'), 'blocks_regular', dict(ylabel='# Blocks', xlabel='time (s)', title='Blocks in time', legend=False, figsize=(10,6), grid=True), False), - (( fork + '_consensus_rounds', time_window[0], time_window[1], '1s'), 'rounds', dict(ylabel='# Rounds', xlabel='time (s)', title='Rounds per block', legend=False, figsize=(10,6), grid=True), False), - (( 'rate(' + fork + '_consensus_height[20s])*60', time_window[0], time_window[1], '1s'), 'block_rate_regular', dict(ylabel='Blocks/min', xlabel='time (s)', title='Rate of block creation', legend=False, figsize=(10,6), grid=True), True), - #(( 'avg(rate(cometbft_consensus_height[20s])*60)', time_window[0], time_window[1], '1s'), 'block_rate_avg_reg', dict(ylabel='Blocks/min', xlabel='time (s)', title='Rate of block creation', legend=False, figsize=(10,6), grid=True), False), - #(( 'cometbft_consensus_total_txs', time_window[0], time_window[1], '1s'), 'total_txs_regular', dict(ylabel='# TXs', xlabel='time (s)', title='Transactions in time', legend=False, figsize=(10,6), grid=True), False), - (( 'rate(' + fork + '_consensus_total_txs[20s])*60', time_window[0], time_window[1], '1s'), 'total_txs_rate_regular', dict(ylabel='TXs/min', xlabel='time (s)', title='Rate of transaction processing', legend=False, figsize=(10,6), grid=True), True), - #(( 'avg(rate(cometbft_consensus_total_txs[20s])*60)', time_window[0], time_window[1], '1s'), 'total_txs_rate_avg_reg', dict(ylabel='TXs/min', xlabel='time (s)', title='Rate of transaction processing', legend=False, figsize=(10,6), grid=True), False), - (( 'process_resident_memory_bytes', time_window[0], time_window[1], '1s'), 'memory', dict(ylabel='Memory (bytes)', xlabel='time (s)', title='Memory usage', legend=False, figsize=(10,6), grid=True), False), - (( 'avg(process_resident_memory_bytes)', time_window[0], time_window[1], '1s'), 'avg_memory', dict(ylabel='Memory (bytes)', xlabel='time (s)', title='Average Memory usage', legend=False, figsize=(10,6), grid=True), False), - (( 'node_load1', time_window[0], time_window[1], '1s'), 'cpu', dict(ylabel='Load', xlabel='time (s)', title='Node load', legend=False, figsize=(10,6), grid=True), False), - (( 'avg(node_load1)', time_window[0], time_window[1], '1s'), 'avg_cpu', dict(ylabel='Load', xlabel='time (s)', title='Average Node load', legend=False, figsize=(10,6), grid=True), False), - #extended window metrics - (( fork + '_consensus_height', ext_time_window[0], ext_time_window[1], '1s'), 'blocks', dict(ylabel='# Blocks', xlabel='time (s)', title='Blocks in time', legend=False, figsize=(10,6), grid=True), False), - (( 'rate(' + fork + '_consensus_height[20s])*60', ext_time_window[0], ext_time_window[1], '1s'), 'block_rate', dict(ylabel='Blocks/min', xlabel='time (s)', title='Rate of block creation', legend=False, figsize=(10,6), grid=True), True), - (( fork + 
'_consensus_total_txs', ext_time_window[0], ext_time_window[1], '1s'), 'total_txs', dict(ylabel='# TXs', xlabel='time (s)', title='Transactions in time', legend=False, figsize=(10,6), grid=True), False),
-    (( 'rate(' + fork + '_consensus_total_txs[20s])*60', ext_time_window[0], ext_time_window[1], '1s'), 'total_txs_rate', dict(ylabel='TXs/min', xlabel='time (s)', title='Rate of transaction processing', legend=False, figsize=(10,6), grid=True), True),
-]
-
-queriesRotating = [
-    (( 'rate(' + fork + '_consensus_height[20s])*60', time_window[0], time_window[1], '1s'), 'rotating_block_rate', dict(ylabel='blocks/min', xlabel='time', title='Rate of Block Creation', legend=False, figsize=(10,6), grid=True), False),
-    (( 'rate(' + fork + '_consensus_total_txs[20s])*60', time_window[0], time_window[1], '1s'), 'rotating_txs_rate', dict(ylabel='TXs/min', xlabel='time', title='Rate of Transaction processing', legend=False, figsize=(10,6), grid=True), False),
-    (( fork + '_consensus_height{job=~"ephemeral.*"} or ' + fork + '_blocksync_latest_block_height{job=~"ephemeral.*"}',
+from prometheus_pandas import query as prometheus_query
+
+
+PROMETHEUS_URL = 'http://localhost:9090'
+IMAGES_DIR = 'imgs'
+TEST_CASES = ['200_nodes', 'rotating', 'vote_extensions']
+
+
+def usage():
+    print("Usage:")
+    print(f"\t{sys.argv[0]} release_name start_time window_size test_case")
+    print("where:")
+    print(f"- start_time is a UTC time in '%Y-%m-%dT%H:%M:%SZ' format")
+    print(f"- window_size is in seconds")
+    print(f"- test_case is one of {TEST_CASES}")
+    print(f"Example: \t{sys.argv[0]} v1.0.0-alpha.2 2024-03-21T08:45:23Z 180 200_nodes")
+    exit(1)
+
+
+def queries_200_nodes(time_window, ext_time_window):
+    return [
+        (( 'cometbft_mempool_size', time_window[0], time_window[1], '1s'), 'mempool_size', dict(ylabel='TXs', xlabel='time (s)', title='Mempool Size', legend=False, figsize=(10,6), grid=True, kind='area',stacked=True), False),
+        (( 'cometbft_p2p_peers', time_window[0], time_window[1], '1s'), 'peers', dict(ylabel='# Peers', xlabel='time (s)', title='Peers', legend=False, figsize=(10,6), grid=True), True),
+        (( 'avg(cometbft_mempool_size)', time_window[0], time_window[1], '1s'), 'avg_mempool_size', dict(ylabel='TXs', xlabel='time (s)', title='Average Mempool Size', legend=False, figsize=(10,6), grid=True), False),
+        #(( 'cometbft_consensus_height', time_window[0], time_window[1], '1s'), 'blocks_regular', dict(ylabel='# Blocks', xlabel='time (s)', title='Blocks in time', legend=False, figsize=(10,6), grid=True), False),
+        (( 'cometbft_consensus_rounds', time_window[0], time_window[1], '1s'), 'rounds', dict(ylabel='# Rounds', xlabel='time (s)', title='Rounds per block', legend=False, figsize=(10,6), grid=True), False),
+        (( 'rate(cometbft_consensus_height[20s])*60', time_window[0], time_window[1], '1s'), 'block_rate_regular', dict(ylabel='Blocks/min', xlabel='time (s)', title='Rate of block creation', legend=False, figsize=(10,6), grid=True), True),
+        #(( 'avg(rate(cometbft_consensus_height[20s])*60)', time_window[0], time_window[1], '1s'), 'block_rate_avg_reg', dict(ylabel='Blocks/min', xlabel='time (s)', title='Rate of block creation', legend=False, figsize=(10,6), grid=True), False),
+        #(( 'cometbft_consensus_total_txs', time_window[0], time_window[1], '1s'), 'total_txs_regular', dict(ylabel='# TXs', xlabel='time (s)', title='Transactions in time', legend=False, figsize=(10,6), grid=True), False),
+        (( 'rate(cometbft_consensus_total_txs[20s])*60', time_window[0], time_window[1], '1s'), 'total_txs_rate_regular', 
dict(ylabel='TXs/min', xlabel='time (s)', title='Rate of transaction processing', legend=False, figsize=(10,6), grid=True), True), + #(( 'avg(rate(cometbft_consensus_total_txs[20s])*60)', time_window[0], time_window[1], '1s'), 'total_txs_rate_avg_reg', dict(ylabel='TXs/min', xlabel='time (s)', title='Rate of transaction processing', legend=False, figsize=(10,6), grid=True), False), + (( 'process_resident_memory_bytes', time_window[0], time_window[1], '1s'), 'memory', dict(ylabel='Memory (bytes)', xlabel='time (s)', title='Memory usage', legend=False, figsize=(10,6), grid=True), False), + (( 'avg(process_resident_memory_bytes)', time_window[0], time_window[1], '1s'), 'avg_memory', dict(ylabel='Memory (bytes)', xlabel='time (s)', title='Average Memory usage', legend=False, figsize=(10,6), grid=True), False), + (( 'node_load1', time_window[0], time_window[1], '1s'), 'cpu', dict(ylabel='Load', xlabel='time (s)', title='Node load', legend=False, figsize=(10,6), grid=True), False), + (( 'avg(node_load1)', time_window[0], time_window[1], '1s'), 'avg_cpu', dict(ylabel='Load', xlabel='time (s)', title='Average Node load', legend=False, figsize=(10,6), grid=True), False), + + # Extended window metrics + (( 'cometbft_consensus_height', ext_time_window[0], ext_time_window[1], '1s'), 'blocks', dict(ylabel='# Blocks', xlabel='time (s)', title='Blocks in time', legend=False, figsize=(10,6), grid=True), False), + (( 'rate(cometbft_consensus_height[20s])*60', ext_time_window[0], ext_time_window[1], '1s'), 'block_rate', dict(ylabel='Blocks/min', xlabel='time (s)', title='Rate of block creation', legend=False, figsize=(10,6), grid=True), True), + (( 'cometbft_consensus_total_txs', ext_time_window[0], ext_time_window[1], '1s'), 'total_txs', dict(ylabel='# TXs', xlabel='time (s)', title='Transactions in time', legend=False, figsize=(10,6), grid=True), False), + (( 'rate(cometbft_consensus_total_txs[20s])*60', ext_time_window[0], ext_time_window[1], '1s'), 'total_txs_rate', dict(ylabel='TXs/min', xlabel='time (s)', title='Rate of transaction processing', legend=False, figsize=(10,6), grid=True), True), + ] + + +def queries_rotating(time_window): + return [ + (( 'rate(cometbft_consensus_height[20s])*60', time_window[0], time_window[1], '1s'), 'rotating_block_rate', dict(ylabel='blocks/min', xlabel='time', title='Rate of Block Creation', legend=False, figsize=(10,6), grid=True), False), + (( 'rate(cometbft_consensus_total_txs[20s])*60', time_window[0], time_window[1], '1s'), 'rotating_txs_rate', dict(ylabel='TXs/min', xlabel='time', title='Rate of Transaction processing', legend=False, figsize=(10,6), grid=True), False), + (( 'cometbft_consensus_height{job=~"ephemeral.*"} or cometbft_blocksync_latest_block_height{job=~"ephemeral.*"}', time_window[0], time_window[1], '1s'), 'rotating_eph_heights', dict(ylabel='height', xlabel='time', title='Heights of Ephemeral Nodes', legend=False, figsize=(10,6), grid=True), False), - (( fork + '_p2p_peers', time_window[0], time_window[1], '1s'), 'rotating_peers', dict(ylabel='# peers', xlabel='time', title='Peers', legend=False, figsize=(10,6), grid=True), False), - (( 'avg(process_resident_memory_bytes)', time_window[0], time_window[1], '1s'), 'rotating_avg_memory', dict(ylabel='memory (bytes)', xlabel='time', title='Average Memory Usage', legend=False, figsize=(10,6), grid=True), False), - (( 'node_load1', time_window[0], time_window[1], '1s'), 'rotating_cpu', dict(ylabel='load', xlabel='time', title='Node Load', legend=False, figsize=(10,6), grid=True), False), -] - 
-queriesVExtension= [ - (( fork + '_mempool_size', time_window[0], time_window[1], '1s'), 'mempool_size', dict(ylabel='TXs', xlabel='time (s)', title='Mempool Size', legend=False, figsize=(10,6), grid=True, kind='area',stacked=True), False), - (( fork + '_mempool_size', time_window[0], time_window[1], '1s'), 'mempool_size_not_stacked', dict(ylabel='TXs', xlabel='time (s)', title='Mempool Size', legend=False, figsize=(10,6), grid=True, stacked=False), False), - (( fork + '_p2p_peers', time_window[0], time_window[1], '1s'), 'peers', dict(ylabel='# Peers', xlabel='time (s)', title='Peers', legend=False, figsize=(10,6), grid=True), True), - (( 'avg(' + fork + '_mempool_size)', time_window[0], time_window[1], '1s'), 'avg_mempool_size', dict(ylabel='TXs', xlabel='time (s)', title='Average Mempool Size', legend=False, figsize=(10,6), grid=True), False), - (( fork + '_consensus_rounds', time_window[0], time_window[1], '1s'), 'rounds', dict(ylabel='# Rounds', xlabel='time (s)', title='Rounds per block', legend=False, figsize=(10,6), grid=True), False), - (( 'process_resident_memory_bytes', time_window[0], time_window[1], '1s'), 'memory', dict(ylabel='Memory (bytes)', xlabel='time (s)', title='Memory usage', legend=False, figsize=(10,6), grid=True), False), - (( 'avg(process_resident_memory_bytes)', time_window[0], time_window[1], '1s'), 'avg_memory', dict(ylabel='Memory (bytes)', xlabel='time (s)', title='Average Memory usage', legend=False, figsize=(10,6), grid=True), False), - (( 'node_load1', time_window[0], time_window[1], '1s'), 'cpu', dict(ylabel='Load', xlabel='time (s)', title='Node load', legend=False, figsize=(10,6), grid=True), False), - (( 'avg(node_load1)', time_window[0], time_window[1], '1s'), 'avg_cpu', dict(ylabel='Load', xlabel='time (s)', title='Average Node load', legend=False, figsize=(10,6), grid=True), False), - (( fork + '_consensus_height', time_window[0], time_window[1], '1s'), 'blocks', dict(ylabel='# Blocks', xlabel='time (s)', title='Blocks in time', legend=False, figsize=(10,6), grid=True), False), - (( 'rate(' + fork + '_consensus_height[20s])*60', time_window[0], time_window[1], '1s'), 'block_rate', dict(ylabel='Blocks/min', xlabel='time (s)', title='Rate of block creation', legend=False, figsize=(10,6), grid=True), True), - (( fork + '_consensus_total_txs', time_window[0], time_window[1], '1s'), 'total_txs', dict(ylabel='# TXs', xlabel='time (s)', title='Transactions in time', legend=False, figsize=(10,6), grid=True), False), - (( 'rate(' + fork + '_consensus_total_txs[20s])*60', time_window[0], time_window[1], '1s'), 'total_txs_rate', dict(ylabel='TXs/min', xlabel='time (s)', title='Rate of transaction processing', legend=False, figsize=(10,6), grid=True), True), -] - -#queries = queries200Nodes -#queries = queriesRotating -queries = queriesVExtension - - -for (query, file_name, pandas_params, plot_average) in queries: - print(query) - - data_frame = prometheus.query_range(*query) - #Tweak the x ticks - data_frame = data_frame.set_index(md.date2num(data_frame.index)) - - - pandas_params["title"] += " - " + release - ax = data_frame.plot(**pandas_params) - if plot_average: - average = data_frame.mean(axis=1) - data_frame['__average__'] = average - pandas_params['lw'] = 8 - pandas_params['style'] = ['--'] - pandas_params['color'] = ['red'] - ax = data_frame['__average__'].plot(**pandas_params) - - ax.xaxis.set_major_formatter(md.DateFormatter('%H:%M:%S')) - plt.savefig(os.path.join(path, file_name + '.png')) - plt.plot() - -plt.show() + (( 'cometbft_p2p_peers', 
time_window[0], time_window[1], '1s'), 'rotating_peers', dict(ylabel='# peers', xlabel='time', title='Peers', legend=False, figsize=(10,6), grid=True), False), + (( 'avg(process_resident_memory_bytes)', time_window[0], time_window[1], '1s'), 'rotating_avg_memory', dict(ylabel='memory (bytes)', xlabel='time', title='Average Memory Usage', legend=False, figsize=(10,6), grid=True), False), + (( 'node_load1', time_window[0], time_window[1], '1s'), 'rotating_cpu', dict(ylabel='load', xlabel='time', title='Node Load', legend=False, figsize=(10,6), grid=True), False), + ] + + +def queries_vote_extensions(time_window): + return [ + (( 'cometbft_mempool_size', time_window[0], time_window[1], '1s'), 'mempool_size', dict(ylabel='TXs', xlabel='time (s)', title='Mempool Size', legend=False, figsize=(10,6), grid=True, kind='area',stacked=True), False), + (( 'cometbft_mempool_size', time_window[0], time_window[1], '1s'), 'mempool_size_not_stacked', dict(ylabel='TXs', xlabel='time (s)', title='Mempool Size', legend=False, figsize=(10,6), grid=True, stacked=False), False), + (( 'cometbft_p2p_peers', time_window[0], time_window[1], '1s'), 'peers', dict(ylabel='# Peers', xlabel='time (s)', title='Peers', legend=False, figsize=(10,6), grid=True), True), + (( 'avg(cometbft_mempool_size)', time_window[0], time_window[1], '1s'), 'avg_mempool_size', dict(ylabel='TXs', xlabel='time (s)', title='Average Mempool Size', legend=False, figsize=(10,6), grid=True), False), + (( 'cometbft_consensus_rounds', time_window[0], time_window[1], '1s'), 'rounds', dict(ylabel='# Rounds', xlabel='time (s)', title='Rounds per block', legend=False, figsize=(10,6), grid=True), False), + (( 'process_resident_memory_bytes', time_window[0], time_window[1], '1s'), 'memory', dict(ylabel='Memory (bytes)', xlabel='time (s)', title='Memory usage', legend=False, figsize=(10,6), grid=True), False), + (( 'avg(process_resident_memory_bytes)', time_window[0], time_window[1], '1s'), 'avg_memory', dict(ylabel='Memory (bytes)', xlabel='time (s)', title='Average Memory usage', legend=False, figsize=(10,6), grid=True), False), + (( 'node_load1', time_window[0], time_window[1], '1s'), 'cpu', dict(ylabel='Load', xlabel='time (s)', title='Node load', legend=False, figsize=(10,6), grid=True), False), + (( 'avg(node_load1)', time_window[0], time_window[1], '1s'), 'avg_cpu', dict(ylabel='Load', xlabel='time (s)', title='Average Node load', legend=False, figsize=(10,6), grid=True), False), + (( 'cometbft_consensus_height', time_window[0], time_window[1], '1s'), 'blocks', dict(ylabel='# Blocks', xlabel='time (s)', title='Blocks in time', legend=False, figsize=(10,6), grid=True), False), + (( 'rate(cometbft_consensus_height[20s])*60', time_window[0], time_window[1], '1s'), 'block_rate', dict(ylabel='Blocks/min', xlabel='time (s)', title='Rate of block creation', legend=False, figsize=(10,6), grid=True), True), + (( 'cometbft_consensus_total_txs', time_window[0], time_window[1], '1s'), 'total_txs', dict(ylabel='# TXs', xlabel='time (s)', title='Transactions in time', legend=False, figsize=(10,6), grid=True), False), + (( 'rate(cometbft_consensus_total_txs[20s])*60', time_window[0], time_window[1], '1s'), 'total_txs_rate', dict(ylabel='TXs/min', xlabel='time (s)', title='Rate of transaction processing', legend=False, figsize=(10,6), grid=True), True), + ] + + +def main(release, start_time, window_size, test_case): + prometheus = prometheus_query.Prometheus(PROMETHEUS_URL) + + end_time = pd.to_datetime(start_time) + pd.Timedelta(**dict(seconds=window_size)) + 
time_window = (start_time, end_time.strftime('%Y-%m-%dT%H:%M:%SZ')) + + ext_end_time = pd.to_datetime(start_time) + pd.Timedelta(**dict(seconds=window_size+50)) + ext_time_window = (start_time, ext_end_time.strftime('%Y-%m-%dT%H:%M:%SZ')) + + # Select queries depending on the test case. + match test_case: + case "200_nodes": + queries = queries_200_nodes(time_window, ext_time_window) + case "rotating": + queries = queries_rotating(time_window) + case "vote_extensions": + queries = queries_vote_extensions(time_window) + case _: + print(f"Error: Unknown test case {test_case}") + return + + imgs_dir = os.path.join(IMAGES_DIR, test_case) + if not os.path.exists(imgs_dir): + os.makedirs(imgs_dir) + + # Query Prometheus and plot images. + for (query, file_name, pandas_params, plot_average) in queries: + print(f"query: {query}") + + df = prometheus.query_range(*query) + #Tweak the x ticks + df = df.set_index(md.date2num(df.index)) + + if df.empty: + print('No data found! Check the timestamps or the query.') + continue + + pandas_params["title"] += " - " + release + ax = df.plot(**pandas_params) + if plot_average: + average = df.mean(axis=1) + df['__average__'] = average + pandas_params['lw'] = 8 + pandas_params['style'] = ['--'] + pandas_params['color'] = ['red'] + ax = df['__average__'].plot(**pandas_params) + + ax.xaxis.set_major_formatter(md.DateFormatter('%H:%M:%S')) + plt.savefig(os.path.join(imgs_dir, file_name + '.png')) + plt.plot() + + plt.show() + + +if __name__ == "__main__": + if len(sys.argv) < 4 or not (sys.argv[1] and sys.argv[2] and sys.argv[3] and sys.argv[4]): + usage() + + release = sys.argv[1] + start_time = sys.argv[2] + window_size = sys.argv[3] + test_case = sys.argv[4] + main(release, start_time, int(window_size), test_case) From 53154027b0baec597617d4d5588cd93c33e0970a Mon Sep 17 00:00:00 2001 From: hvanz Date: Thu, 21 Mar 2024 17:32:23 +0100 Subject: [PATCH 2/9] comment --- scripts/qa/reporting/prometheus_plotter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/qa/reporting/prometheus_plotter.py b/scripts/qa/reporting/prometheus_plotter.py index 1c5e6bcc4d..317e6a77a6 100644 --- a/scripts/qa/reporting/prometheus_plotter.py +++ b/scripts/qa/reporting/prometheus_plotter.py @@ -1,4 +1,4 @@ -# pip install numpy pandas matplotlib requests +# pip install requests matplotlib numpy pandas prometheus-pandas import os import requests import sys From 063edcd2b7529652d5167ec3efdcedcd832867dc Mon Sep 17 00:00:00 2001 From: hvanz Date: Fri, 22 Mar 2024 00:32:23 +0100 Subject: [PATCH 3/9] update latency scripts --- scripts/qa/reporting/README.md | 51 ++-- scripts/qa/reporting/latency_plotter.py | 292 +++++++++++---------- scripts/qa/reporting/latency_throughput.py | 50 ++-- 3 files changed, 198 insertions(+), 195 deletions(-) diff --git a/scripts/qa/reporting/README.md b/scripts/qa/reporting/README.md index 1bd0067a6b..2f5e6e70ac 100644 --- a/scripts/qa/reporting/README.md +++ b/scripts/qa/reporting/README.md @@ -17,8 +17,8 @@ This directory contains some utility scripts used in the reporting/QA. ## Setup -Execute the following within this directory (the same directory as the -`latency_throughput.py` file). +Before running the Python scripts, execute the following within this directory (the same directory +as the `latency_throughput.py` file). 
 ```bash
 # Create a virtual environment into which to install your dependencies
@@ -32,46 +32,35 @@ pip install -r requirements.txt
 ```
 
 ## Latency vs Throughput Plotting
-To show the instructions and parameter options, execute
+To show the instructions and parameter options, execute 
 ```bash
-./latency_throughput.py --help
+python3 latency_throughput.py --help
 ```
+Be sure that the virtual environment is activated before running the script.
 
-Example:
-
+For example, the following command will generate a PNG file called `cmt_v1.png` in the current
+directory based on the `raw.csv` file generated by the reporting tool. The `-t` flag overrides the
+default title at the top of the plot.
 ```bash
-# Do the following while ensuring that the virtual environment is activated (see
-# the Setup steps).
-#
-# This will generate a plot in a PNG file called 'tm034.png' in the current
-# directory based on the reporting tool CSV output in the "raw.csv" file. The
-# '-t' flag overrides the default title at the top of the plot.
-
-./latency_throughput.py \
-    -t 'CometBFT v0.34.x Latency vs Throughput' \
-    ./tm034.png \
-    /path/to/csv/files/raw.csv
+./latency_throughput.py -t 'CometBFT v1.x Latency vs Throughput' ./cmt_v1.png /path/to/results/raw.csv
 ```
 
 ## Latency vs Throughput Plotting (version 2)
 
-Example:
+The `latency_plotter.py` script generates a series of plots in the `imgs` folder.
+Plots include combined experiment plots and experiments as subplots.
+- `all_experiments.png`: plots of all experiments as individual subplots.
+- `all_configs.png`: plots of all experiments, grouped by configuration (c, r).
+- `cXrY.png`: independent plot of the experiments of configuration (c=X, r=Y) as different curves.
+- `cXrY_merged.png`: independent plot of the experiments of configuration (c=X, r=Y) combined into a single curve.
+- `e_ID.png`: independent plot of just the experiment with id `ID` as a single curve.
+
+Example:
 ```bash
-# Do the following while ensuring that the virtual environment is activated (see
-# the Setup steps).
-#
-# This will generate a series of plots in the `imgs` folder.
-# Plots include combined experiment plots and experiments as subplots.
-#  - all_experiments - plots of all experiments as individual subplots.
-#  - all_configs - plots of all experiments, grouped by configuration (r,c).
-#    cXrY.png - Independent plot of experiments of configuration (c=X,r=Y) as different curves.
-#    cXrY_merged.png - Independent plot of experiments of configuration (c=X,r=Y) combined as single curve.
-#    e_ID.png - independent plot with just experiment with id ID as a single curve.
-
-mkdir -p imgs
-python3 latency_plotter.py /path/to/csv/files/raw.csv
+python3 latency_plotter.py v1.0.0-alpha.2 /path/to/results/raw.csv
 ```
+Be sure that the virtual environment is activated before running the script.
 
 ## Prometheus metrics
 
diff --git a/scripts/qa/reporting/latency_plotter.py b/scripts/qa/reporting/latency_plotter.py
index 3b42eedff8..426d53c931 100644
--- a/scripts/qa/reporting/latency_plotter.py
+++ b/scripts/qa/reporting/latency_plotter.py
@@ -1,150 +1,168 @@
 import sys
 import os
-from datetime import datetime
 import pytz
+from datetime import datetime
 
 import matplotlib as mpl
 import matplotlib.pyplot as plt
-
 import numpy as np
 import pandas as pd
 
-release = 'v0.38.0-alpha2'
+IMAGES_DIR = 'imgs'
+
+
+def usage():
+    print(f"Usage: {sys.argv[0]} release_name raw_csv_path")
+    exit(1)
+
 #FIXME: figure out in which timezone prometheus was running to adjust to UTC. 
-tz = pytz.timezone('America/Sao_Paulo') - -if len(sys.argv) != 2: - print('Pls provide the raw.csv file') - exit() -else: - csvpath = sys.argv[1] - if not os.path.exists(csvpath): - print('Pls provide a valid the raw.csv file') - exit() +tz = pytz.timezone('UTC') + + +def plot_all_experiments(release, csv): + # Group by experiment + groups = csv.groupby(['experiment_id']) + + # number of rows and columns in the graph + ncols = 2 if groups.ngroups > 1 else 1 + nrows = int( np.ceil(groups.ngroups / ncols)) if groups.ngroups > 1 else 1 + fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(6*ncols, 4*nrows), sharey=False) + fig.tight_layout(pad=5.0) + + # Plot experiments as subplots + for (key,ax) in zip(groups.groups.keys(), [axes] if ncols == 1 else axes.flatten()): + group = groups.get_group(key) + ax.set_ylabel('latency (s)') + ax.set_xlabel('experiment time (s)') + ax.set_title(key) + ax.grid(True) + + # Group by connection number and transaction rate + paramGroups = group.groupby(['connections','rate']) + for (subKey) in paramGroups.groups.keys(): + subGroup = paramGroups.get_group(subKey) + startTime = subGroup.block_time.min() + endTime = subGroup.block_time.max() + localStartTime = tz.localize(datetime.fromtimestamp(startTime)).astimezone(pytz.utc) + localEndTime = tz.localize(datetime.fromtimestamp(endTime)).astimezone(pytz.utc) + subGroup.block_time.apply(lambda x: x - startTime ) + mean = subGroup.duration_ns.mean() + print('exp', key ,'start', localEndTime.strftime("%Y-%m-%dT%H:%M:%SZ"), 'end', localStartTime.strftime("%Y-%m-%dT%H:%M:%SZ"), 'duration', endTime - startTime, "mean", mean) + + (con,rate) = subKey + label = 'c='+str(con) + ' r='+ str(rate) + ax.axhline(y = mean, color = 'r', linestyle = '-', label="mean") + ax.scatter(subGroup.block_time, subGroup.duration_ns, label=label) + ax.legend() + + # Save individual axes + extent = ax.get_window_extent().transformed(fig.dpi_scale_trans.inverted()) + img_path = os.path.join(IMAGES_DIR, f'e_{key}.png') + fig.savefig(img_path, bbox_inches=extent.expanded(1.2, 1.3)) + + fig.suptitle('Vote Extensions Testnet - ' + release) + + # Save the figure with subplots + fig.savefig(os.path.join(IMAGES_DIR, 'all_experiments.png')) + + +def plot_all_configs(release, csv): + # Group by configuration + groups = csv.groupby(['connections','rate']) + + # number of rows and columns in the graph + ncols = 2 if groups.ngroups > 1 else 1 + nrows = int( np.ceil(groups.ngroups / ncols)) if groups.ngroups > 1 else 1 + fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(6*ncols, 4*nrows), sharey=True) + fig.tight_layout(pad=5.0) + + # Plot configurations as subplots + for (key,ax) in zip(groups.groups.keys(), [axes] if ncols == 1 else axes.flatten()): + group = groups.get_group(key) + ax.set_ylabel('latency (s)') + ax.set_xlabel('experiment time (s)') + ax.grid(True) + (con,rate) = key + label = 'c='+str(con) + ' r='+ str(rate) + ax.set_title(label) + + # Group by experiment + paramGroups = group.groupby(['experiment_id']) + for (subKey) in paramGroups.groups.keys(): + subGroup = paramGroups.get_group((subKey,)) + startTime = subGroup.block_time.min() + subGroupMod = subGroup.block_time.apply(lambda x: x - startTime) + ax.scatter(subGroupMod, subGroup.duration_ns, label=label) + #ax.legend() - print(csvpath) - -path = os.path.join('imgs') - -#Load the CSV -csv = pd.read_csv(csvpath) - -#Transform ns to s in the latency/duration -csv['duration_ns'] = csv['duration_ns'].apply(lambda x: x/10**9) -csv['block_time'] = 
csv['block_time'].apply(lambda x: x/10**9) - -#Group by experiment -groups = csv.groupby(['experiment_id']) - -#number of rows and columns in the graph -ncols = 2 if groups.ngroups > 1 else 1 -nrows = int( np.ceil(groups.ngroups / ncols)) if groups.ngroups > 1 else 1 -fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(6*ncols, 4*nrows), sharey=False) -fig.tight_layout(pad=5.0) - - -#Plot experiments as subplots -for (key,ax) in zip(groups.groups.keys(), [axes] if ncols == 1 else axes.flatten()): - group = groups.get_group(key) - ax.set_ylabel('latency (s)') - ax.set_xlabel('experiment time (s)') - ax.set_title(key) - ax.grid(True) - - #Group by connection number and transaction rate - paramGroups = group.groupby(['connections','rate']) - for (subKey) in paramGroups.groups.keys(): - subGroup = paramGroups.get_group(subKey) - startTime = subGroup.block_time.min() - endTime = subGroup.block_time.max() - localStartTime = tz.localize(datetime.fromtimestamp(startTime)).astimezone(pytz.utc) - localEndTime = tz.localize(datetime.fromtimestamp(endTime)).astimezone(pytz.utc) - subGroup.block_time.apply(lambda x: x - startTime ) - mean = subGroup.duration_ns.mean() - print('exp', key ,'start', localEndTime.strftime("%Y-%m-%dT%H:%M:%SZ"), 'end', localStartTime.strftime("%Y-%m-%dT%H:%M:%SZ"), 'duration', endTime - startTime, "mean", mean) - - (con,rate) = subKey + + #Save individual axes + extent = ax.get_window_extent().transformed(fig.dpi_scale_trans.inverted()) + img_path = os.path.join(IMAGES_DIR, f'c{con}r{rate}.png') + fig.savefig(img_path, bbox_inches=extent.expanded(1.2, 1.3)) + + fig.suptitle('Vote Extensions Testnet - ' + release) + + # Save the figure with subplots + fig.savefig(os.path.join(IMAGES_DIR, 'all_configs.png')) + + +def plot_merged(release, csv): + # Group by configuration + groups = csv.groupby(['connections','rate']) + + # number of rows and columns in the graph + ncols = 2 if groups.ngroups > 1 else 1 + nrows = int( np.ceil(groups.ngroups / ncols)) if groups.ngroups > 1 else 1 + fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(6*ncols, 4*nrows), sharey=True) + fig.tight_layout(pad=5.0) + + # Plot configurations as subplots + for (key,ax) in zip(groups.groups.keys(), [axes] if ncols == 1 else axes.flatten()): + group = groups.get_group(key) + ax.set_ylabel('latency (s)') + ax.set_xlabel('experiment time (s)') + ax.grid(True) + (con,rate) = key label = 'c='+str(con) + ' r='+ str(rate) - ax.axhline(y = mean, color = 'r', linestyle = '-', label="mean") - ax.scatter(subGroup.block_time, subGroup.duration_ns, label=label) - ax.legend() - - #Save individual axes - extent = ax.get_window_extent().transformed(fig.dpi_scale_trans.inverted()) - fig.savefig(os.path.join(path,'e_'+key + '.png'), bbox_inches=extent.expanded(1.2, 1.3)) - -fig.suptitle('Vote Extensions Testnet - ' + release) - -# Save the figure with subplots -fig.savefig(os.path.join(path,'all_experiments.png')) - - - -#Group by configuration -groups = csv.groupby(['connections','rate']) - -#number of rows and columns in the graph -ncols = 2 if groups.ngroups > 1 else 1 -nrows = int( np.ceil(groups.ngroups / ncols)) if groups.ngroups > 1 else 1 -fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(6*ncols, 4*nrows), sharey=True) -fig.tight_layout(pad=5.0) - -#Plot configurations as subplots -for (key,ax) in zip(groups.groups.keys(), [axes] if ncols == 1 else axes.flatten()): - group = groups.get_group(key) - ax.set_ylabel('latency (s)') - ax.set_xlabel('experiment time (s)') - ax.grid(True) - 
(con,rate) = key
-    label = 'c='+str(con) + ' r='+ str(rate)
-    ax.set_title(label)
-
-    #Group by experiment
-    paramGroups = group.groupby(['experiment_id'])
-    for (subKey) in paramGroups.groups.keys():
-        subGroup = paramGroups.get_group(subKey)
-        startTime = subGroup.block_time.min()
-        subGroupMod = subGroup.block_time.apply(lambda x: x - startTime)
-        ax.scatter(subGroupMod, subGroup.duration_ns, label=label)
-        #ax.legend()
-
-
-    #Save individual axes
-    extent = ax.get_window_extent().transformed(fig.dpi_scale_trans.inverted())
-    fig.savefig(os.path.join(path,'c'+str(con) + 'r'+ str(rate) + '.png'), bbox_inches=extent.expanded(1.2, 1.3))
-
-fig.suptitle('Vote Extensions Testnet - ' + release)
-
-
-# Save the figure with subplots
-fig.savefig(os.path.join(path,'all_configs.png'))
-
-
-fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(6*ncols, 4*nrows), sharey=True)
-fig.tight_layout(pad=5.0)
-
-#Plot configurations as subplots
-for (key,ax) in zip(groups.groups.keys(), [axes] if ncols == 1 else axes.flatten()):
-    group = groups.get_group(key)
-    ax.set_ylabel('latency (s)')
-    ax.set_xlabel('experiment time (s)')
-    ax.grid(True)
-    (con,rate) = key
-    label = 'c='+str(con) + ' r='+ str(rate)
-    ax.set_title(label)
-
-    #Group by experiment, but merge them as a single experiment
-    paramGroups = group.groupby(['experiment_id'])
-    for (subKey) in paramGroups.groups.keys():
-        subGroup = paramGroups.get_group(subKey)
-        startTime = subGroup.block_time.min()
-        subGroupMod = subGroup.block_time.apply(lambda x: x - startTime)
-        ax.scatter(subGroupMod, subGroup.duration_ns, marker='o',c='#1f77b4')
-
-    #Save individual axes
-    extent = ax.get_window_extent().transformed(fig.dpi_scale_trans.inverted())
-    (con,rate) = key
-    fig.savefig(os.path.join(path,'c'+str(con) + 'r'+ str(rate) + '_merged.png'), bbox_inches=extent)
-
-plt.show()
+        ax.set_title(label)
+
+        # Group by experiment, but merge them as a single experiment
+        paramGroups = group.groupby(['experiment_id'])
+        for (subKey) in paramGroups.groups.keys():
+            subGroup = paramGroups.get_group((subKey,))
+            startTime = subGroup.block_time.min()
+            subGroupMod = subGroup.block_time.apply(lambda x: x - startTime)
+            ax.scatter(subGroupMod, subGroup.duration_ns, marker='o',c='#1f77b4')
+
+        # Save individual axes
+        extent = ax.get_window_extent().transformed(fig.dpi_scale_trans.inverted())
+        (con, rate) = key
+        img_path = os.path.join(IMAGES_DIR, f'c{con}r{rate}_merged.png')
+        fig.savefig(img_path, bbox_inches=extent)
+
+    plt.show()
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 3 or not (sys.argv[1] and sys.argv[2]):
+        usage()
+    release = sys.argv[1]
+    csv_path = sys.argv[2]
+
+    if not os.path.exists(csv_path):
+        print('Please provide a valid raw.csv file')
+        exit()
+    csv = pd.read_csv(csv_path)
+
+    # Transform ns to s in the latency/duration
+    csv['duration_ns'] = csv['duration_ns'].apply(lambda x: x/10**9)
+    csv['block_time'] = csv['block_time'].apply(lambda x: x/10**9)
+
+    if not os.path.exists(IMAGES_DIR):
+        os.makedirs(IMAGES_DIR)
+
+    plot_all_experiments(release, csv)
+    plot_all_configs(release, csv)
+    plot_merged(release, csv)
diff --git a/scripts/qa/reporting/latency_throughput.py b/scripts/qa/reporting/latency_throughput.py
index adaa4b76ca..75bb744fdc 100755
--- a/scripts/qa/reporting/latency_throughput.py
+++ b/scripts/qa/reporting/latency_throughput.py
@@ -15,33 +15,8 @@
 import matplotlib.pyplot as plt
 import numpy as np
 
-DEFAULT_TITLE = "CometBFT latency vs throughput"
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        
description="Renders a latency vs throughput diagram " - "for a set of transactions provided by the loadtime reporting tool", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('-t', - '--title', - default=DEFAULT_TITLE, - help='Plot title') - parser.add_argument('output_image', - help='Output image file (in PNG format)') - parser.add_argument( - 'input_csv_file', - nargs='+', - help="CSV input file from which to read transaction data " - "- must have been generated by the loadtime reporting tool") - args = parser.parse_args() - logging.basicConfig(format='%(levelname)s\t%(message)s', - stream=sys.stdout, - level=logging.INFO) - plot_latency_vs_throughput(args.input_csv_file, - args.output_image, - title=args.title) +DEFAULT_TITLE = "CometBFT latency vs throughput" def plot_latency_vs_throughput(input_files, output_image, title=DEFAULT_TITLE): @@ -167,4 +142,25 @@ def compute_experiments_stats(experiments): if __name__ == "__main__": - main() + parser = argparse.ArgumentParser( + description="Renders a latency vs throughput diagram " + "for a set of transactions provided by the loadtime reporting tool", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('-t', + '--title', + default=DEFAULT_TITLE, + help='Plot title') + parser.add_argument('output_image', + help='Output image file (in PNG format)') + parser.add_argument( + 'input_csv_file', + nargs='+', + help="CSV input file from which to read transaction data " + "- must have been generated by the loadtime reporting tool") + args = parser.parse_args() + + logging.basicConfig(format='%(levelname)s\t%(message)s', + stream=sys.stdout, + level=logging.INFO) + + plot_latency_vs_throughput(args.input_csv_file, args.output_image, title=args.title) From ce8451724179dc1b8f9586a4c6c6d203ce71a913 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hern=C3=A1n=20Vanzetto?= <15466498+hvanz@users.noreply.github.com> Date: Mon, 25 Mar 2024 16:04:09 +0100 Subject: [PATCH 4/9] Update scripts/qa/reporting/README.md Co-authored-by: lasaro --- scripts/qa/reporting/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/qa/reporting/README.md b/scripts/qa/reporting/README.md index 2f5e6e70ac..4d85ec7ced 100644 --- a/scripts/qa/reporting/README.md +++ b/scripts/qa/reporting/README.md @@ -43,7 +43,7 @@ For example, the following command will generate a PNG file called `cmt_v1.png` directory based on the `raw.csv` file generated by the reporting tool. The `-t` flag overrides the default title at the top of the plot. 
```bash -./latency_throughput.py -t 'CometBFT v1.x Latency vs Throughput' ./cmt_v1.png /path/to/results/raw.csv +python3 latency_throughput.py -t 'CometBFT v1.x Latency vs Throughput' ./cmt_v1.png /path/to/results/raw.csv ``` ## Latency vs Throughput Plotting (version 2) From c1eca942b9ab6368f777f2c5ac3a1157f8316905 Mon Sep 17 00:00:00 2001 From: hvanz Date: Tue, 26 Mar 2024 08:45:25 +0100 Subject: [PATCH 5/9] Make python files executable --- scripts/qa/reporting/README.md | 10 +++++----- scripts/qa/reporting/latency_plotter.py | 2 ++ scripts/qa/reporting/latency_throughput.py | 1 + scripts/qa/reporting/prometheus_plotter.py | 3 +++ 4 files changed, 11 insertions(+), 5 deletions(-) mode change 100644 => 100755 scripts/qa/reporting/latency_plotter.py mode change 100644 => 100755 scripts/qa/reporting/prometheus_plotter.py diff --git a/scripts/qa/reporting/README.md b/scripts/qa/reporting/README.md index 4d85ec7ced..a285d8c2f0 100644 --- a/scripts/qa/reporting/README.md +++ b/scripts/qa/reporting/README.md @@ -1,6 +1,6 @@ # Reporting Scripts -This directory contains some utility scripts used in the reporting/QA. +This directory contains some utility scripts used for generating reports of QA processes. * [`latency_throughput.py`](./latency_throughput.py) is a Python script that uses [matplotlib] to plot a graph of transaction latency vs throughput rate based on @@ -35,7 +35,7 @@ pip install -r requirements.txt To show the instructions and parameter options, execute ```bash -python3 latency_throughput.py --help +./latency_throughput.py --help ``` Be sure that the virtual environment is enabled before running the script. @@ -43,7 +43,7 @@ For example, the following command will generate a PNG file called `cmt_v1.png` directory based on the `raw.csv` file generated by the reporting tool. The `-t` flag overrides the default title at the top of the plot. ```bash -python3 latency_throughput.py -t 'CometBFT v1.x Latency vs Throughput' ./cmt_v1.png /path/to/results/raw.csv +./latency_throughput.py -t 'CometBFT v1.x Latency vs Throughput' ./cmt_v1.png /path/to/results/raw.csv ``` ## Latency vs Throughput Plotting (version 2) @@ -58,7 +58,7 @@ Plots include combined experiment plots and experiments as subplots. Example: ```bash -python3 latency_plotter.py v1.0.0-alpha.2 /path/to/results/raw.csv +./latency_plotter.py v1.0.0-alpha.2 /path/to/results/raw.csv ``` Be sure that the virtual environment is enabled before running the script. @@ -70,7 +70,7 @@ Before running the script, check that a Prometheus server in `localhost:9090`. T Run the script from the virtual environment as follows: ```bash -python3 prometheus_plotter.py +./prometheus_plotter.py ``` For details and examples of how to run the script, just run `python3 prometheus_plotter.py` diff --git a/scripts/qa/reporting/latency_plotter.py b/scripts/qa/reporting/latency_plotter.py old mode 100644 new mode 100755 index 426d53c931..a5c6b320d6 --- a/scripts/qa/reporting/latency_plotter.py +++ b/scripts/qa/reporting/latency_plotter.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import sys import os import pytz diff --git a/scripts/qa/reporting/latency_throughput.py b/scripts/qa/reporting/latency_throughput.py index 75bb744fdc..c048068171 100755 --- a/scripts/qa/reporting/latency_throughput.py +++ b/scripts/qa/reporting/latency_throughput.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 + """ A simple script to parse the CSV output from the loadtime reporting tool (see https://github.com/cometbft/cometbft/tree/main/test/loadtime/cmd/report). 
diff --git a/scripts/qa/reporting/prometheus_plotter.py b/scripts/qa/reporting/prometheus_plotter.py old mode 100644 new mode 100755 index 317e6a77a6..8dbe0ed3a0 --- a/scripts/qa/reporting/prometheus_plotter.py +++ b/scripts/qa/reporting/prometheus_plotter.py @@ -1,3 +1,6 @@ +#!/usr/bin/env python3 + +# Requirements: # pip install requests matplotlib numpy pandas prometheus-pandas import os import requests From 10b97c8fa92cd9c484556f2b22cd752d04c0bcb2 Mon Sep 17 00:00:00 2001 From: lasarojc Date: Tue, 26 Mar 2024 12:13:14 -0300 Subject: [PATCH 6/9] Fix start time of experiments to 0. --- scripts/qa/reporting/latency_plotter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/qa/reporting/latency_plotter.py b/scripts/qa/reporting/latency_plotter.py index a5c6b320d6..80684cb1e1 100755 --- a/scripts/qa/reporting/latency_plotter.py +++ b/scripts/qa/reporting/latency_plotter.py @@ -46,10 +46,10 @@ def plot_all_experiments(release, csv): subGroup = paramGroups.get_group(subKey) startTime = subGroup.block_time.min() endTime = subGroup.block_time.max() + subGroup.block_time = subGroup.block_time.apply(lambda x: x - startTime ) + mean = subGroup.duration_ns.mean() localStartTime = tz.localize(datetime.fromtimestamp(startTime)).astimezone(pytz.utc) localEndTime = tz.localize(datetime.fromtimestamp(endTime)).astimezone(pytz.utc) - subGroup.block_time.apply(lambda x: x - startTime ) - mean = subGroup.duration_ns.mean() print('exp', key ,'start', localEndTime.strftime("%Y-%m-%dT%H:%M:%SZ"), 'end', localStartTime.strftime("%Y-%m-%dT%H:%M:%SZ"), 'duration', endTime - startTime, "mean", mean) (con,rate) = subKey From 76a7c3c719f5a35a76ad2149d44a38265995b423 Mon Sep 17 00:00:00 2001 From: hvanz Date: Wed, 27 Mar 2024 08:53:56 +0100 Subject: [PATCH 7/9] fix logging message --- scripts/qa/reporting/latency_plotter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/qa/reporting/latency_plotter.py b/scripts/qa/reporting/latency_plotter.py index 80684cb1e1..2cedc24106 100755 --- a/scripts/qa/reporting/latency_plotter.py +++ b/scripts/qa/reporting/latency_plotter.py @@ -50,7 +50,7 @@ def plot_all_experiments(release, csv): mean = subGroup.duration_ns.mean() localStartTime = tz.localize(datetime.fromtimestamp(startTime)).astimezone(pytz.utc) localEndTime = tz.localize(datetime.fromtimestamp(endTime)).astimezone(pytz.utc) - print('exp', key ,'start', localEndTime.strftime("%Y-%m-%dT%H:%M:%SZ"), 'end', localStartTime.strftime("%Y-%m-%dT%H:%M:%SZ"), 'duration', endTime - startTime, "mean", mean) + print('experiment', key ,'start', localStartTime.strftime("%Y-%m-%dT%H:%M:%SZ"), 'end', localEndTime.strftime("%Y-%m-%dT%H:%M:%SZ"), 'duration', endTime - startTime, "mean", mean) (con,rate) = subKey label = 'c='+str(con) + ' r='+ str(rate) From baa63780c2630ce868698b5e6bc79e52dd869f07 Mon Sep 17 00:00:00 2001 From: hvanz Date: Wed, 27 Mar 2024 11:18:27 +0100 Subject: [PATCH 8/9] fix arguments check --- scripts/qa/reporting/prometheus_plotter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/qa/reporting/prometheus_plotter.py b/scripts/qa/reporting/prometheus_plotter.py index 8dbe0ed3a0..0f80625e74 100755 --- a/scripts/qa/reporting/prometheus_plotter.py +++ b/scripts/qa/reporting/prometheus_plotter.py @@ -142,7 +142,7 @@ def main(release, start_time, window_size, test_case): if __name__ == "__main__": - if len(sys.argv) < 4 or not (sys.argv[1] and sys.argv[2] and sys.argv[3] and sys.argv[4]): + if len(sys.argv) < 5 or not (sys.argv[1] and 
sys.argv[2] and sys.argv[3] and sys.argv[4]): usage() release = sys.argv[1] From 50e7e72d4b6c567632b4631db5f56d8bf346a2b2 Mon Sep 17 00:00:00 2001 From: hvanz Date: Thu, 4 Apr 2024 18:20:34 +0200 Subject: [PATCH 9/9] revert changes to method.md --- docs/references/qa/method.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/references/qa/method.md b/docs/references/qa/method.md index e55ba5d947..4127473fe5 100644 --- a/docs/references/qa/method.md +++ b/docs/references/qa/method.md @@ -159,14 +159,12 @@ The CometBFT team should improve it at every iteration to increase the amount of [`latency_throughput.py`]: ../../../scripts/qa/reporting/README.md#Latency-vs-Throughput-Plotting [`latency_plotter.py`]: ../../../scripts/qa/reporting/README.md#Latency-vs-Throughput-Plotting-version-2 -#### Extract Prometheus Metrics +#### Extracting Prometheus Metrics 1. Stop the prometheus server if it is running as a service (e.g. a `systemd` unit). -2. Unzip the prometheus database retrieved from the testnet. +2. Unzip the prometheus database retrieved from the testnet, and move it to replace the + local prometheus database. 3. Start the prometheus server and make sure no error logs appear at start up. - ```bash - prometheus --storage.tsdb.path=path/to/prometheus/data --config.file=path/to/prometheus.yml - ``` 4. Identify the time window you want to plot in your graphs. 5. Execute the [`prometheus_plotter.py`] script for the time window.