Skip to content

Commit

Permalink
moved average benchmark results to p90 results (#2152)
Browse files Browse the repository at this point in the history
* moved average benchmark results to p90 results
* increased the measured iterations from 15 to 20 and reduced warmups from 3 to 2 by default
* removed testOrchestrator option from saucelabs config and gradle
* profiling benchmark now prints raw values to the console, to later read them from the log file, but we just assert on the cpu overhead
* Benchmarks in SauceLabs will now run on 2 devices with Android 12, 3 with Android 11, 2 with Android 10
* added collecting the refresh rate of the device
* sdk init duration increase threshold increased to 250 milliseconds
* cpu overhead range for the same operation increased to -2%..2%
* added a test to send profiles to a Sentry project (dogfooding test)
  • Loading branch information
stefanosiano committed Jul 13, 2022
1 parent 3ad96de commit f160e0d
Show file tree
Hide file tree
Showing 18 changed files with 525 additions and 194 deletions.
31 changes: 15 additions & 16 deletions .sauce/sentry-uitest-android-benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,27 +21,25 @@ suites:
# Devices are chosen so that there is a high-end and a low-end device for each api level
- name: "Android 12 (api 31)"
devices:
- id: Google_Pixel_6_Pro_real_us # Google Pixel 6 Pro - api 31 (12)
- id: Google_Pixel_3_12_real_us # Google Pixel 3 - api 31 (12)
testOptions:
useTestOrchestrator: true
- id: Google_Pixel_6_Pro_real_us # Google Pixel 6 Pro - api 31 (12) - high end
- id: Google_Pixel_3_12_real_us # Google Pixel 3 - api 31 (12) - low end

- name: "Android 11 (api 30)"
devices:
- id: OnePlus_9_Pro_real_us # OnePlus 9 Pro - api 30 (11)
- id: Google_Pixel_2_real_us # Google Pixel 2 - api 30 (11)
testOptions:
useTestOrchestrator: true
- id: OnePlus_9_Pro_real_us # OnePlus 9 Pro - api 30 (11) - high end
- id: Google_Pixel_4_real_us # Google Pixel 4 - api 30 (11) - mid end
- id: Google_Pixel_2_real_us # Google Pixel 2 - api 30 (11) - low end

# Commenting for the moment, due to the error "Cannot install test-services-1.4.1.apk on device" on low Android versions
# - name: "Android 5 (api 22)"
# devices:
# - id: Amazon_Kindle_Fire_HD_8_real_us # Amazon Kindle Fire HD 8 - api 22 (5.1.1)
# testOptions:
# useTestOrchestrator: true
- name: "Android 10 (api 29)"
devices:
- id: Google_Pixel_4_XL_real_us1 # Google Pixel 4 XL - api 29 (10)
- id: Nokia_7_1_real_us # Nokia 7.1 - api 29 (10)

# - id: Google_Pixel_4_XL_real_us1 # Google Pixel 4 XL - api 29 (10)
# - id: Motorola_Moto_G_Power_real_us # Motorola Moto G Power (2021) - api 29 (10)
# At the time of writing (July, 4, 2022), the market share per android version is:
# 12.0 = 17.54%, 11.0 = 31.65%, 10.0 = 21.92%
# Using these 3 versions we cover 71.11% of all devices out there. Currently, this is enough for benchmarking scope
# Leaving these devices here in case we change our minds about them
# devices:
# - id: Samsung_Galaxy_S8_plus_real_us # Samsung Galaxy S8+ - api 28 (9)
# - id: LG_G8_ThinQ_real_us # LG G8 ThinQ - api 28 (9)
# - id: OnePlus_5_real_us # OnePlus 5 - api 27 (8.1.0)
Expand All @@ -54,6 +52,7 @@ suites:
# - id: LG_K10_real # LG K10 - api 24 (7.0)
# - id: Samsung_Galaxy_S6_Edge_Plus_real # Samsung Galaxy S6 Edge+ - api 23 (6.0.1)
# - id: Samsung_Tab_E_real_us # Samsung Tab E - api 23 (6.0.1)
# - id: Amazon_Kindle_Fire_HD_8_real_us # Amazon Kindle Fire HD 8 - api 22 (5.1.1)

artifacts:
download:
Expand Down
2 changes: 0 additions & 2 deletions .sauce/sentry-uitest-android-ui.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@ suites:
- name: "Android 12 (api 31)"
devices:
- id: Samsung_Galaxy_S22_Ultra_5G_real_us # Samsung Galaxy S22 Ultra 5G - api 31 (12)
testOptions:
useTestOrchestrator: true

# Controls what artifacts to fetch when the suite on Sauce Cloud has finished.
artifacts:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ android {
// https://developer.android.com/training/testing/instrumented-tests/androidx-test-libraries/runner#enable-gradle
// This doesn't work on some devices with Android 11+. Clearing package data resets permissions.
// Check the readme for more info.
// Test orchestrator was removed due to issues with SauceLabs
// testInstrumentationRunnerArguments["clearPackageData"] = "true"
}

Expand All @@ -34,10 +35,6 @@ android {
viewBinding = true
}

testOptions {
execution = "ANDROIDX_TEST_ORCHESTRATOR"
}

signingConfigs {
getByName("debug") {
storeFile = rootProject.file("debug.keystore")
Expand All @@ -51,7 +48,6 @@ android {

buildTypes {
getByName("debug") {
isDebuggable = false
isMinifyEnabled = true
signingConfig = signingConfigs.getByName("debug")
proguardFiles(getDefaultProguardFile("proguard-android-optimize.txt"), "benchmark-proguard-rules.pro")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ package io.sentry.uitest.android.benchmark

import android.content.Context
import android.view.Choreographer
import androidx.lifecycle.Lifecycle
import androidx.test.core.app.ApplicationProvider
import androidx.test.core.app.launchActivity
import androidx.test.platform.app.InstrumentationRegistry
import androidx.test.runner.AndroidJUnitRunner
import kotlin.test.BeforeTest
Expand All @@ -22,5 +24,8 @@ abstract class BaseBenchmarkTest {
runner.runOnMainSync {
choreographer = Choreographer.getInstance()
}
// We need the refresh rate, but we can get it only from the activity, so we start and destroy one
val benchmarkScenario = launchActivity<BenchmarkActivity>()
benchmarkScenario.moveToState(Lifecycle.State.DESTROYED)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,19 @@ class SdkBenchmarkTest : BaseBenchmarkTest() {
it.tracesSampleRate = 1.0
}
}
val simpleSdkResult = BenchmarkOperation.compare(opNoSdk, "No Sdk", opSimpleSdk, "Simple Sdk")
val perfProfilingSdkResult = BenchmarkOperation.compare(opNoSdk2, "No Sdk", opPerfProfilingSdk, "Sdk with perf and profiling")
val refreshRate = BenchmarkActivity.refreshRate ?: 60F
val simpleSdkResults = BenchmarkOperation.compare(opNoSdk, "No Sdk", opSimpleSdk, "Simple Sdk", refreshRate)
val simpleSdkResult = simpleSdkResults.getSummaryResult()
simpleSdkResult.printResults()
val perfProfilingSdkResults = BenchmarkOperation.compare(opNoSdk2, "No Sdk", opPerfProfilingSdk, "Sdk with perf and profiling", refreshRate)
val perfProfilingSdkResult = perfProfilingSdkResults.getSummaryResult()
perfProfilingSdkResult.printResults()

val maxDurationThreshold = TimeUnit.MILLISECONDS.toNanos(100)
val maxDurationThreshold = TimeUnit.MILLISECONDS.toNanos(250)
assertTrue(simpleSdkResult.durationIncreaseNanos in 0..maxDurationThreshold)
assertTrue(simpleSdkResult.cpuTimeIncreaseMillis in 0..100)
assertTrue(perfProfilingSdkResult.durationIncreaseNanos in simpleSdkResult.durationIncreaseNanos..maxDurationThreshold)
assertTrue(perfProfilingSdkResult.cpuTimeIncreaseMillis in simpleSdkResult.cpuTimeIncreaseMillis..100)
assertTrue(perfProfilingSdkResult.durationIncreaseNanos in 0..maxDurationThreshold)
assertTrue(perfProfilingSdkResult.cpuTimeIncreaseMillis in 0..100)
}

private fun getOperation(init: (() -> Unit)? = null) = BenchmarkOperation(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,15 @@ class SentryBenchmarkTest : BaseBenchmarkTest() {
// should be very similar.
val op1 = BenchmarkOperation(choreographer, op = getOperation(runner))
val op2 = BenchmarkOperation(choreographer, op = getOperation(runner))
val comparisonResult = BenchmarkOperation.compare(op1, "Op1", op2, "Op2")
val refreshRate = BenchmarkActivity.refreshRate ?: 60F
val comparisonResults = BenchmarkOperation.compare(op1, "Op1", op2, "Op2", refreshRate)
val comparisonResult = comparisonResults.getSummaryResult()
comparisonResult.printResults()

assertTrue(comparisonResult.durationIncreasePercentage in -1F..1F)
assertTrue(comparisonResult.cpuTimeIncreasePercentage in -1F..1F)
// Currently we just want to assert the cpu overhead
assertTrue(comparisonResult.cpuTimeIncreasePercentage in -2F..2F)
// The fps decrease comparison is skipped, due to approximation: 59.51 and 59.49 fps are considered 60 and 59,
// respectively. Also, if the average fps is 20 or 60, a difference of 1 fps becomes 5% or 1.66% respectively.
assertTrue(comparisonResult.droppedFramesIncreasePercentage in -1F..1F)
}

@Test
Expand Down Expand Up @@ -76,17 +78,20 @@ class SentryBenchmarkTest : BaseBenchmarkTest() {
}
}
)
val comparisonResult = BenchmarkOperation.compare(
val refreshRate = BenchmarkActivity.refreshRate ?: 60F
val comparisonResults = BenchmarkOperation.compare(
benchmarkOperationNoTransaction,
"NoTransaction",
benchmarkOperationProfiled,
"ProfiledTransaction"
"ProfiledTransaction",
refreshRate
)
comparisonResults.printAllRuns("Profiling Benchmark")
val comparisonResult = comparisonResults.getSummaryResult()
comparisonResult.printResults()

assertTrue(comparisonResult.durationIncreasePercentage in 0F..5F)
// Currently we just want to assert the cpu overhead
assertTrue(comparisonResult.cpuTimeIncreasePercentage in 0F..5F)
assertTrue(comparisonResult.fpsDecreasePercentage in 0F..5F)
assertTrue(comparisonResult.droppedFramesIncreasePercentage in 0F..5F)
}

/**
Expand All @@ -103,12 +108,10 @@ class SentryBenchmarkTest : BaseBenchmarkTest() {
}
// Just swipe the list some times: this is the benchmarked operation
swipeList(2)
// We finish the transaction
// We finish the transaction. We do it on main thread, so there's no need to perform other operations after it
runner.runOnMainSync {
transaction?.finish()
}
// We swipe a last time to measure how finishing the transaction may affect other operations
swipeList(1)

benchmarkScenario.moveToState(Lifecycle.State.DESTROYED)
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
package io.sentry.uitest.android.benchmark.util

import java.util.concurrent.TimeUnit

/**
 * Raw, per-iteration result of a [BenchmarkOperation] comparison.
 * Holds one value per measured iteration for each metric; use [getSummaryResult] to
 * collapse the lists into a single p90 summary.
 */
internal data class BenchmarkComparisonResult(
    /** Number of measured iterations. */
    val iterations: Int,
    /** Screen refresh rate. */
    val refreshRate: Float,
    /** Number of cpu cores of the device. */
    val cores: Int,
    /** Name of the first compared operation. */
    val op1Name: String,
    /** Name of the second compared operation. */
    val op2Name: String,
    /** Raw cpu time in milliseconds of op1. */
    val op1CpuTime: List<Long>,
    /** Raw cpu time in milliseconds of op2. */
    val op2CpuTime: List<Long>,
    /** Increase of cpu time in milliseconds. */
    val cpuTimeIncreases: List<Long>,
    /** Increase of cpu time in percentage. */
    val cpuTimeIncreasePercentages: List<Double>,
    /** Raw dropped frames of op1. */
    val op1DroppedFrames: List<Double>,
    /** Raw dropped frames of op2. */
    val op2DroppedFrames: List<Double>,
    /** Increase of dropped frames. */
    val droppedFramesIncreases: List<Double>,
    /** Increase of dropped frames in percentage. */
    val droppedFramesIncreasePercentages: List<Double>,
    /** Raw duration in nanoseconds of op1. */
    val op1Duration: List<Long>,
    /** Raw duration in nanoseconds of op2. */
    val op2Duration: List<Long>,
    /** Increase of duration in nanoseconds. If it's low enough, no end user will ever realize it. */
    val durationIncreaseNanos: List<Long>,
    /** Increase of duration in percentage. */
    val durationIncreasePercentage: List<Double>,
    /** Raw fps of op1. */
    val op1Fps: List<Int>,
    /** Raw fps of op2. */
    val op2Fps: List<Int>,
    /** Decrease of fps. */
    val fpsDecreases: List<Int>,
    /** Decrease of fps in percentage. */
    val fpsDecreasePercentages: List<Double>
) {

    /**
     * Prints the raw results of all runs of the comparison.
     * Each printed line is prefixed by [prefix], to allow parsers to easily parse log files to read raw values.
     */
    fun printAllRuns(prefix: String) {
        repeat(iterations) { index ->

            println("$prefix ==================== Iteration $index ====================")

            println("$prefix [$op2Name]: duration=${op2Duration[index]} ns, cpuTime=${op2CpuTime[index]}, fps=${op2Fps[index]}, droppedFrames=${op2DroppedFrames[index]}")
            println("$prefix [$op1Name]: duration=${op1Duration[index]} ns, cpuTime=${op1CpuTime[index]}, fps=${op1Fps[index]}, droppedFrames=${op1DroppedFrames[index]}")
            println(
                "$prefix Duration increase: %.2f%% (%d ns = %d ms)".format(
                    durationIncreasePercentage[index],
                    durationIncreaseNanos[index],
                    TimeUnit.NANOSECONDS.toMillis(durationIncreaseNanos[index])
                )
            )

            println(
                "$prefix CPU time overhead, over $cores cores: %.2f%% (%d ms)".format(
                    cpuTimeIncreasePercentages[index],
                    cpuTimeIncreases[index]
                )
            )

            println("$prefix FPS decrease: %.2f%% (%d fps)".format(fpsDecreasePercentages[index], fpsDecreases[index]))

            // Total frames expected in the measured window, derived from the duration and the display refresh rate.
            val expectedFrames = TimeUnit.NANOSECONDS.toMillis(op2Duration[index]) * refreshRate / 1000
            println(
                "$prefix Frame drop increase, over $expectedFrames total frames, with $refreshRate hz: %.2f%% (%.2f)".format(
                    droppedFramesIncreasePercentages[index],
                    droppedFramesIncreases[index]
                )
            )
        }
    }

    /** Collapses the per-iteration lists into a single [BenchmarkSummaryResult] using the 90th percentile of each metric. */
    fun getSummaryResult() = BenchmarkSummaryResult(
        calculatePercentile(cpuTimeIncreases, 90, 0L),
        calculatePercentile(cpuTimeIncreasePercentages, 90, 0.0),
        calculatePercentile(droppedFramesIncreases, 90, 0.0),
        calculatePercentile(droppedFramesIncreasePercentages, 90, 0.0),
        calculatePercentile(durationIncreaseNanos, 90, 0L),
        calculatePercentile(durationIncreasePercentage, 90, 0.0),
        calculatePercentile(fpsDecreases, 90, 0),
        calculatePercentile(fpsDecreasePercentages, 90, 0.0)
    )

    /**
     * Calculate the [percentile] of the [list]. [percentile] should be in the range 0, 100.
     * Returns [default] when [list] is empty: a typed default avoids the previous unchecked
     * `0 as T` cast, which threw a ClassCastException at runtime for Long/Double lists.
     * Note: the index is computed with integer division (rounds down), matching the original behavior.
     */
    private fun <T : Comparable<T>> calculatePercentile(list: List<T>, percentile: Int, default: T): T {
        if (list.isEmpty()) {
            return default
        }
        val sortedList = list.sorted()
        // Upper bound is list.size - 1: coercing to list.size could produce an out-of-bounds index.
        val percentileIndex = (list.size * percentile / 100 - 1).coerceIn(0, list.size - 1)
        return sortedList[percentileIndex]
    }
}

/** Summary (p90) result of the [BenchmarkOperation] comparison, as produced by [BenchmarkComparisonResult.getSummaryResult]. */
internal data class BenchmarkSummaryResult(
    /**
     * Increase of cpu time in milliseconds.
     * It has no direct impact on performance of the app, but it has on battery usage, as the cpu is 'awaken' longer.
     */
    val cpuTimeIncreaseMillis: Long,
    /** Increase of cpu time in percentage. */
    val cpuTimeIncreasePercentage: Double,
    /**
     * Increase of dropped frames. Very important, as it weights dropped frames based on the time
     * passed between each frame. This is the metric end users can perceive as 'performance' in app usage.
     */
    val droppedFramesIncrease: Double,
    /** Increase of dropped frames in percentage. */
    val droppedFramesIncreasePercentage: Double,
    /** Increase of duration in nanoseconds. If it's low enough, no end user will ever realize it. */
    val durationIncreaseNanos: Long,
    /** Increase of duration in percentage. */
    val durationIncreasePercentage: Double,
    /**
     * Decrease of fps. Not really important, as even if fps are the same, the cpu could be
     * doing more work in the frame window, and it could be hidden by checking average fps only.
     */
    val fpsDecrease: Int,
    /** Decrease of fps in percentage. */
    val fpsDecreasePercentage: Double
) {

    /** Prints the summary results of the comparison. */
    fun printResults() {
        println(
            "Duration increase: %.2f%% (%d ns = %d ms)".format(
                durationIncreasePercentage,
                durationIncreaseNanos,
                TimeUnit.NANOSECONDS.toMillis(durationIncreaseNanos)
            )
        )
        println("CPU time overhead: %.2f%% (%d ms)".format(cpuTimeIncreasePercentage, cpuTimeIncreaseMillis))
        println("FPS decrease: %.2f%% (%d fps)".format(fpsDecreasePercentage, fpsDecrease))
        println("Frame drop increase: %.2f%% (%.2f)".format(droppedFramesIncreasePercentage, droppedFramesIncrease))
    }
}

0 comments on commit f160e0d

Please sign in to comment.