Skip to content

Commit

Permalink
[Profiling] Calculate CO2 emission and costs (#101979)
Browse files Browse the repository at this point in the history
This PR moves the cost and CO2 calculations from Kibana into the ES profiling API.
The calculations are improved by applying AWS price lists and co2/cpu/energy data from instance types (AWS, GCP, Azure).
The stacktraces and flamegraph APIs are extended with the cost and CO2 values in a backwards compatible way.

* Add CO2 and costs calculator
* Query stacktrace events aggregated by (host.id,stacktrace.id)
* Add co2Factor, costFactor and count to StackTraces
* Add co2Factor, costFactor and count to Flamegraph
* Require field 'requested_duration' in stacktraces request
* Implement a PoC calculation for co2 and costs
* Rename co2/costs response fields
* Use helper function for fetching single values from search response
* ArrayList -> List
* Remove hot debug logs
* Load profiling costs from file
* Update profiling-costs.json.gz
* Use method references for StopWatch.report()
* Do co2/cost in parallel to stacktrace metadata queries
* Move CO2 calculations into a separate class
* Move cost calculations into a separate class
* Cleanups / address comments
* Restore stack_trace_events array and fix tests
* Add comments for the query bucket sizes
* Fix tests
* Fix HostMetadata
* Cleanups
* More cleanups
* Extract DatacenterInstance from host metadata
* Improve host metadata query to only fetch relevant docs
* Calculate wattsPerCore from host CPU architecture
* Get PUE and CO2TonsPerKWH from provider and region
* Remove filter for AWS host metadata
* Add changelog
* Fix host metadata query and query sizes
* Add unit test for HostMetadata
* Reduce number of host metadata docs to be retrieved
* Add unit tests for CO2 Calculator
* Add unit tests for cost Calculator
* Fix tests for CO2 calculations for known instance type
* Address PR comments
* Add region to InstanceType.toString()
* Add optional request param 'custom_cost_factor'
* Use TraceEvent instead of Long in GetStackTracesResponse.java
* Fix integration tests
* Add customCostFactor to cost calculator
* Fix exception in KvIndexResolver#resolve in DEBUG mode
* Add integration tests
* Rename CostsService -> InstanceTypeService
* Apply review comments
* Use import static java.util.Map.entry
* Remove debug log from KvIndexResolver
* Address review comments
* Address review comments

---------

Co-authored-by: Daniel Mitterdorfer <daniel.mitterdorfer@elastic.co>
  • Loading branch information
rockdaboot and danielmitterdorfer committed Nov 23, 2023
1 parent 1dfaac7 commit 0cce4ae
Show file tree
Hide file tree
Showing 36 changed files with 1,375 additions and 173 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/101979.yaml
@@ -0,0 +1,5 @@
pr: 101979
summary: Calculate CO2 emission and costs
area: Application
type: enhancement
issues: []
Expand Up @@ -94,7 +94,7 @@ public static <T extends ToXContent> Iterator<ToXContent> wrapWithObject(String
return Iterators.concat(startObject(name), iterator, endObject());
}

private static <T> Iterator<ToXContent> map(String name, Map<String, T> map, Function<Map.Entry<String, T>, ToXContent> toXContent) {
public static <T> Iterator<ToXContent> map(String name, Map<String, T> map, Function<Map.Entry<String, T>, ToXContent> toXContent) {
return wrapWithObject(name, Iterators.map(map.entrySet().iterator(), toXContent));
}

Expand Down

This file was deleted.

Expand Up @@ -61,6 +61,7 @@ public void testAutomaticCancellation() throws Exception {
restRequest.setEntity(new StringEntity("""
{
"sample_size": 10000,
"requested_duration": 33,
"query": {
"bool": {
"filter": [
Expand Down
Expand Up @@ -9,7 +9,7 @@

public class GetFlameGraphActionIT extends ProfilingTestCase {
public void testGetStackTracesUnfiltered() throws Exception {
GetStackTracesRequest request = new GetStackTracesRequest(10, null, null, null);
GetStackTracesRequest request = new GetStackTracesRequest(10, 1.0d, 1.0d, null, null, null);
GetFlamegraphResponse response = client().execute(GetFlamegraphAction.INSTANCE, request).get();
// only spot-check top level properties - detailed tests are done in unit tests
assertEquals(297, response.getSize());
Expand Down
Expand Up @@ -14,14 +14,14 @@

public class GetStackTracesActionIT extends ProfilingTestCase {
public void testGetStackTracesUnfiltered() throws Exception {
GetStackTracesRequest request = new GetStackTracesRequest(10, null, null, null);
GetStackTracesRequest request = new GetStackTracesRequest(10, 1.0d, 1.0d, null, null, null);
request.setAdjustSampleCount(true);
GetStackTracesResponse response = client().execute(GetStackTracesAction.INSTANCE, request).get();
assertEquals(40, response.getTotalSamples());
assertEquals(473, response.getTotalFrames());

assertNotNull(response.getStackTraceEvents());
assertEquals(4L, (long) response.getStackTraceEvents().get("L7kj7UvlKbT-vN73el4faQ"));
assertEquals(4L, response.getStackTraceEvents().get("L7kj7UvlKbT-vN73el4faQ").count);

assertNotNull(response.getStackTraces());
// just do a high-level spot check. Decoding is tested in unit-tests
Expand All @@ -30,6 +30,8 @@ public void testGetStackTracesUnfiltered() throws Exception {
assertEquals(18, stackTrace.fileIds.size());
assertEquals(18, stackTrace.frameIds.size());
assertEquals(18, stackTrace.typeIds.size());
assertEquals(0.007903d, stackTrace.annualCO2Tons, 0.000001d);
assertEquals(74.46d, stackTrace.annualCostsUSD, 0.01d);

assertNotNull(response.getStackFrames());
StackFrame stackFrame = response.getStackFrames().get("8NlMClggx8jaziUTJXlmWAAAAAAAAIYI");
Expand All @@ -42,13 +44,20 @@ public void testGetStackTracesUnfiltered() throws Exception {
public void testGetStackTracesFromAPMWithMatch() throws Exception {
TermQueryBuilder query = QueryBuilders.termQuery("transaction.name", "encodeSha1");

GetStackTracesRequest request = new GetStackTracesRequest(null, query, "apm-test-*", "transaction.profiler_stack_trace_ids");
GetStackTracesRequest request = new GetStackTracesRequest(
null,
1.0d,
1.0d,
query,
"apm-test-*",
"transaction.profiler_stack_trace_ids"
);
GetStackTracesResponse response = client().execute(GetStackTracesAction.INSTANCE, request).get();
assertEquals(43, response.getTotalFrames());

assertNotNull(response.getStackTraceEvents());
assertEquals(3L, (long) response.getStackTraceEvents().get("Ce77w10WeIDow3kd1jowlA"));
assertEquals(2L, (long) response.getStackTraceEvents().get("JvISdnJ47BQ01489cwF9DA"));
assertEquals(3L, response.getStackTraceEvents().get("Ce77w10WeIDow3kd1jowlA").count);
assertEquals(2L, response.getStackTraceEvents().get("JvISdnJ47BQ01489cwF9DA").count);

assertNotNull(response.getStackTraces());
// just do a high-level spot check. Decoding is tested in unit-tests
Expand All @@ -69,7 +78,14 @@ public void testGetStackTracesFromAPMWithMatch() throws Exception {
public void testGetStackTracesFromAPMNoMatch() throws Exception {
TermQueryBuilder query = QueryBuilders.termQuery("transaction.name", "nonExistingTransaction");

GetStackTracesRequest request = new GetStackTracesRequest(null, query, "apm-test-*", "transaction.profiler_stack_trace_ids");
GetStackTracesRequest request = new GetStackTracesRequest(
null,
1.0d,
1.0d,
query,
"apm-test-*",
"transaction.profiler_stack_trace_ids"
);
GetStackTracesResponse response = client().execute(GetStackTracesAction.INSTANCE, request).get();
assertEquals(0, response.getTotalFrames());
}
Expand Down
Expand Up @@ -93,7 +93,7 @@ protected final void indexDoc(String index, String id, Map<String, Object> sourc
}

/**
* @return <code>true</code> iff this test relies that data (and the corresponding indices / data streams) are present for this test.
* @return <code>true</code> iff this test relies on data (and the corresponding indices / data streams) being present for this test.
*/
protected boolean requiresDataSetup() {
return true;
Expand Down Expand Up @@ -139,6 +139,7 @@ public void setupData() throws Exception {
bulkIndex("data/profiling-stacktraces.ndjson");
bulkIndex("data/profiling-stackframes.ndjson");
bulkIndex("data/profiling-executables.ndjson");
bulkIndex("data/profiling-hosts.ndjson");
bulkIndex("data/apm-test.ndjson");

refresh();
Expand Down
@@ -1,5 +1,5 @@
{"create": {"_index": "profiling-events-all"}}
{"Stacktrace.count": [1], "profiling.project.id": ["100"], "os.kernel": ["9.9.9-0"], "tags": ["environment:qa", "region:eu-west-1"], "host.ip": ["192.168.1.2"], "@timestamp": ["1698624000"], "container.name": ["instance-0000000010"], "ecs.version": ["1.12.0"], "Stacktrace.id": ["S07KmaoGhvNte78xwwRbZQ"], "agent.version": ["head-be593ef3-1688111067"], "host.name": ["ip-192-168-1-2"], "host.id": ["8457605156473051743"], "process.thread.name": ["497295213074376"]}
{"Stacktrace.count": [1], "profiling.project.id": ["100"], "os.kernel": ["9.9.9-0"], "tags": ["environment:qa", "region:eu-west-1"], "host.ip": ["192.168.1.2"], "@timestamp": ["1700504427"], "container.name": ["instance-0000000010"], "ecs.version": ["1.12.0"], "Stacktrace.id": ["S07KmaoGhvNte78xwwRbZQ"], "agent.version": ["head-be593ef3-1688111067"], "host.name": ["ip-192-168-1-2"], "host.id": ["8457605156473051743"], "process.thread.name": ["497295213074376"]}
{"create": {"_index": "profiling-events-all"}}
{"Stacktrace.count": [1], "profiling.project.id": ["100"], "os.kernel": ["9.9.9-0"], "tags": ["environment:qa", "region:eu-west-1"], "host.ip": ["192.168.1.2"], "@timestamp": ["1698624000"], "container.name": ["instance-0000000010"], "ecs.version": ["1.12.0"], "Stacktrace.id": ["4tB_mGJrj1xVuMFbXVYwGA"], "agent.version": ["head-be593ef3-1688111067"], "host.name": ["ip-192-168-1-2"], "host.id": ["8457605156473051743"], "process.thread.name": ["497295213074376"]}
{"create": {"_index": "profiling-events-all"}}
Expand Down
@@ -0,0 +1,2 @@
{"create": {"_index": "profiling-hosts", "_id": "eLH27YsBj2lLi3tJYlvr"}}
{"profiling.project.id": 100, "host.id": "8457605156473051743", "@timestamp": 1700504426, "ecs.version": "1.12.0", "profiling.agent.build_timestamp": 1688111067, "profiling.instance.private_ipv4s": ["192.168.1.2"], "ec2.instance_life_cycle": "on-demand", "profiling.agent.config.map_scale_factor": 0, "ec2.instance_type": "i3.2xlarge", "profiling.host.ip": "192.168.1.2", "profiling.agent.config.bpf_log_level": 0, "profiling.host.sysctl.net.core.bpf_jit_enable": 1, "profiling.agent.config.file": "/etc/prodfiler/prodfiler.conf", "ec2.local_ipv4": "192.168.1.2", "profiling.agent.config.no_kernel_version_check": false, "profiling.host.machine": "x86_64", ",profiling.host.tags": ["cloud_provider:aws", "cloud_environment:qa", "cloud_region:eu-west-1"], "profiling.agent.config.probabilistic_threshold": 100, "profiling.agent.config.disable_tls": false, "profiling.agent.config.tracers": "all", "profiling.agent.start_time": 1700090045589, "profiling.agent.config.max_elements_per_interval": 800, "ec2.placement.region": "eu-west-1", "profiling.agent.config.present_cpu_cores": 8, "profiling.host.kernel_version": "9.9.9-0-aws", "profiling.agent.config.bpf_log_size": 65536, "profiling.agent.config.known_traces_entries": 65536, "profiling.host.sysctl.kernel.unprivileged_bpf_disabled": 1, "profiling.agent.config.verbose": false, "profiling.agent.config.probabilistic_interval": "1m0s", "ec2.placement.availability_zone_id": "euw1-az1", "ec2.security_groups": "", "ec2.local_hostname": "ip-192-168-1-2.eu-west-1.compute.internal", "ec2.placement.availability_zone": "eu-west-1c", "profiling.agent.config.upload_symbols": false, "profiling.host.sysctl.kernel.bpf_stats_enabled": 0, "profiling.host.name": "ip-192-168-1-2", "ec2.mac": "00:11:22:33:44:55", "profiling.host.kernel_proc_version": "Linux version 9.9.9-0-aws", "profiling.agent.config.cache_directory": "/var/cache/optimyze/", "profiling.agent.version": "v8.12.0", "ec2.hostname": "ip-192-168-1-2.eu-west-1.compute.internal", 
"profiling.agent.config.elastic_mode": false, "ec2.ami_id": "ami-aaaaaaaaaaa", "ec2.instance_id": "i-0b999999999999999" }

0 comments on commit 0cce4ae

Please sign in to comment.