From f4b58bf193443b0a370b388e09f5d07843620428 Mon Sep 17 00:00:00 2001 From: Benjamin Wohlwend Date: Wed, 7 Feb 2018 12:04:02 +0100 Subject: [PATCH] added documentation on how to reduce overhead (#153) also, added documentation for the sample_rate config, and reduced the flush_interval setting to 10s closes #153 --- docs/configuration.asciidoc | 19 +++++-- docs/index.asciidoc | 1 + docs/tuning.asciidoc | 75 ++++++++++++++++++++++++++++ elasticapm/conf/__init__.py | 2 +- tests/contrib/django/django_tests.py | 5 +- 5 files changed, 97 insertions(+), 5 deletions(-) create mode 100644 docs/tuning.asciidoc diff --git a/docs/configuration.asciidoc b/docs/configuration.asciidoc index dcf2a4d39..ad7650323 100644 --- a/docs/configuration.asciidoc +++ b/docs/configuration.asciidoc @@ -346,7 +346,7 @@ If your service handles data like this, we advise to only enable this feature wi |============ | Environment | Django/Flask | Default -| `ELASTIC_APM_FLUSH_INTERVAL` | `FLUSH_INTERVAL` | `60` +| `ELASTIC_APM_FLUSH_INTERVAL` | `FLUSH_INTERVAL` | `10` |============ Interval with which transactions should be sent to the APM server, in seconds. @@ -374,8 +374,8 @@ Setting an upper limit will prevent overloading the agent and the APM server wit ==== `max_queue_size` |============ -| Environment | Django/Flask | Default -| `ELASTIC_APM_MAX_EVENT_QUEUE_LENGTH` | `MAX_QUEUE_SIZE` | `500` +| Environment | Django/Flask | Default +| `ELASTIC_APM_MAX_QUEUE_SIZE` | `MAX_QUEUE_SIZE` | `500` |============ Maximum queue length of transactions before sending transactions to the APM server. @@ -405,6 +405,19 @@ For more information, see <>. WARNING: We recommend to always include the default set of validators if you customize this setting. 
+[float] +[[config-transaction-sample-rate]] +==== `transaction_sample_rate` + +|============ +| Environment | Django/Flask | Default +| `ELASTIC_APM_TRANSACTION_SAMPLE_RATE` | `TRANSACTION_SAMPLE_RATE` | `1.0` +|============ + +By default, the agent will sample every transaction (e.g. request to your service). +To reduce overhead and storage requirements, you can set the sample rate to a value between `0.0` and `1.0`. +We still record overall time and the result for unsampled transactions, but no context information, tags, or spans. + [float] [[config-include-paths]] ==== `include_paths` diff --git a/docs/index.asciidoc b/docs/index.asciidoc index 5086d9141..a53c79a98 100644 --- a/docs/index.asciidoc +++ b/docs/index.asciidoc @@ -39,3 +39,4 @@ include::./sanitizing-data.asciidoc[Sanitizing Data] include::./run-tests-locally.asciidoc[Run Tests Locally] include::./api.asciidoc[API documentation] +include::./tuning.asciidoc[Tuning and Overhead considerations] diff --git a/docs/tuning.asciidoc b/docs/tuning.asciidoc new file mode 100644 index 000000000..7cb9c4bac --- /dev/null +++ b/docs/tuning.asciidoc @@ -0,0 +1,75 @@ +[[tuning-and-overhead]] +== Tuning and Overhead considerations + +Using an APM solution comes with certain trade-offs, and the Python agent for Elastic APM is no different. +Instrumenting your code, measuring timings, recording context data etc. all need resources: + + * CPU time + * memory + * bandwidth use + * Elasticsearch storage + +We invested and continue to invest a lot of effort to keep the overhead of using Elastic APM as low as possible. +But because every deployment is different, there are some knobs you can turn to adapt it to your specific needs. + +[float] +[[tuning-sample-rate]] +=== Transaction Sample Rate + +The most straightforward way to reduce the overhead of the agent is to tell the agent to do less. +If you set the <> to a value below `1.0`, +the agent will randomly sample only a subset of transactions. 
+If a transaction is not sampled, the agent has to do a lot less work, +as we only record the name of the transaction, the overall transaction time and the result for unsampled transactions. + +[options="header"] +|============ +| Field | Sampled | Unsampled +| Transaction name | yes | yes +| Duration | yes | yes +| Result | yes | yes +| Context | yes | no +| Tags | yes | no +| Spans | yes | no +|============ + +Reducing the sample rate to a fraction of all transactions can make a huge difference in all four of the mentioned resource types. + +[float] +[[tuning-queue]] +=== Transaction Queue + +To reduce the load on the APM Server, the agent does not send every transaction up as it happens. +Instead, it queues them up, and flushes the queue periodically, or when it reaches a maximum size, using a background thread. + +While this reduces the load on the APM Server (and to a certain extent on the agent), +holding on to the transaction data in a queue uses memory. +If you notice that using the Python agent results in a large increase of memory use, +you can use these settings: + + * <> to reduce the time between queue flushes + * <> to reduce the maximum size of the queue + +The first setting, `flush_interval`, is helpful if you have a sustained high number of transactions. +The second setting, `max_queue_size`, can help if you experience peaks of transactions +(a large number of transactions in a short period of time). + +Keep in mind that reducing the value of either setting will cause the agent to send more HTTP requests to the APM Server, +potentially causing a higher load. + + +[float] +[[tuning-max-spans]] +=== Spans per transaction + +The average number of spans per transaction can influence how much time the agent spends in each transaction collecting contextual data for each span, +and the storage space needed in Elasticsearch. +In our experience, most usual transactions should have well below 100 spans. 
+In some cases, however, the number of spans can explode: + + * long-running transactions + * unoptimized code, e.g. doing hundreds of SQL queries in a loop + +To prevent such edge cases from overloading both the agent and the APM Server, +the agent stops recording spans when a limit is reached. +You can configure this limit by changing the <> setting. diff --git a/elasticapm/conf/__init__.py b/elasticapm/conf/__init__.py index f7d1e5ed7..d6f206b2e 100644 --- a/elasticapm/conf/__init__.py +++ b/elasticapm/conf/__init__.py @@ -155,7 +155,7 @@ class Config(_ConfigBase): 'elasticapm.processors.sanitize_http_request_querystring', 'elasticapm.processors.sanitize_http_request_body', ]) - flush_interval = _ConfigValue('FLUSH_INTERVAL', type=int, default=60) + flush_interval = _ConfigValue('FLUSH_INTERVAL', type=int, default=10) transaction_sample_rate = _ConfigValue('TRANSACTION_SAMPLE_RATE', type=float, default=1.0) transaction_max_spans = _ConfigValue('TRANSACTION_MAX_SPANS', type=int, default=500) max_queue_size = _ConfigValue('MAX_QUEUE_SIZE', type=int, default=500) diff --git a/tests/contrib/django/django_tests.py b/tests/contrib/django/django_tests.py index 5a9be8fda..834f52296 100644 --- a/tests/contrib/django/django_tests.py +++ b/tests/contrib/django/django_tests.py @@ -1030,7 +1030,10 @@ def test_perf_database_render_no_instrumentation(benchmark, django_elasticapm_cl @pytest.mark.django_db -@pytest.mark.parametrize('django_elasticapm_client', [{'_wait_to_first_send': 100}], indirect=True) +@pytest.mark.parametrize('django_elasticapm_client', [{ + '_wait_to_first_send': 100, + 'flush_interval': 100 +}], indirect=True) def test_perf_transaction_with_collection(benchmark, django_elasticapm_client): django_elasticapm_client.instrumentation_store.get_all() with mock.patch("elasticapm.traces.TransactionsStore.should_collect") as should_collect: