From 5c75526835ca6e3b4c3decfc64d686e7a1fc3f6b Mon Sep 17 00:00:00 2001 From: mdh1418 Date: Mon, 9 Mar 2026 19:21:23 +0000 Subject: [PATCH 1/5] [Tests][UserEvents] Add tracee diagnostic logging Add console messages to the tracee path to confirm event emission completed, and log the tracee exit code in the orchestrator. Add a comment explaining why the tracee timeout is 60s. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/tests/tracing/userevents/common/UserEventsTestRunner.cs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/tests/tracing/userevents/common/UserEventsTestRunner.cs b/src/tests/tracing/userevents/common/UserEventsTestRunner.cs index 65d2f02ab26dfb..8292a25840a1fe 100644 --- a/src/tests/tracing/userevents/common/UserEventsTestRunner.cs +++ b/src/tests/tracing/userevents/common/UserEventsTestRunner.cs @@ -17,6 +17,9 @@ namespace Tracing.UserEvents.Tests.Common public class UserEventsTestRunner { private const int SIGINT = 2; + + // The tracee must JIT compile, receive IPC enable, and emit events before exiting. + // Under JitStress on ARM64, this can take significantly longer than normal. private const int DefaultTraceeExitTimeoutMs = 60000; private const int DefaultRecordTraceExitTimeoutMs = 20000; @@ -65,6 +68,7 @@ public static int Run( Console.WriteLine("Tracee EventSource enabled, emitting events."); traceeAction(); + Console.WriteLine("Tracee finished emitting events."); return 0; } @@ -225,6 +229,7 @@ private static int RunOrchestrator( traceeProcess.Kill(); } traceeProcess.WaitForExit(); // flush async output + Console.WriteLine($"Tracee process exited with code {traceeProcess.ExitCode}."); if (!recordTraceProcess.HasExited) { From 17a4e2d98545f0e6775b7778ea28fef673d77b21 Mon Sep 17 00:00:00 2001 From: mdh1418 Date: Mon, 9 Mar 2026 19:23:20 +0000 Subject: [PATCH 2/5] [Tests][UserEvents] Wait for 'Recording started' instead of fixed delay Replace the 300ms fixed sleep with a synchronization gate that waits for record-trace to emit 'Recording started' on stdout before launching the tracee. This message is printed after PerfSession::enable succeeds, guaranteeing the ring buffers are active and will capture the tracee's mmap events. The fixed delay was insufficient on ARM64 CI machines where record-trace startup (proc scan + ring buffer setup) took ~1845ms, causing the tracee to start in the dead zone between the proc scan and ring buffer enable. If record-trace exits before emitting the signal, report the exit code rather than waiting for the full timeout. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../userevents/common/UserEventsTestRunner.cs | 49 ++++++++++++++----- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/src/tests/tracing/userevents/common/UserEventsTestRunner.cs b/src/tests/tracing/userevents/common/UserEventsTestRunner.cs index 8292a25840a1fe..a84ad48f1d68e8 100644 --- a/src/tests/tracing/userevents/common/UserEventsTestRunner.cs +++ b/src/tests/tracing/userevents/common/UserEventsTestRunner.cs @@ -23,16 +23,18 @@ public class UserEventsTestRunner private const int DefaultTraceeExitTimeoutMs = 60000; private const int DefaultRecordTraceExitTimeoutMs = 20000; - // Delay before starting the tracee to let record-trace finish setup. The - // tracee's ManualResetEventSlim gates on tracepoint registration via IPC, but - // that alone is not sufficient as record-trace's ring buffers must also be active - // (PerfSession::enable) to capture events. Without this delay, the tracee may be - // discovered during record-trace's /proc scan and receive IPC before enable, - // causing events to be lost. With the delay, the tracee is instead discovered - // via MMAP2 records after PerfSession is enabled. - // record-trace startup -> PerfSession::enable scales with system process count: - // averaged 113ms at 126 processes and 253ms at 534 processes on a 2-core system. - private const int RecordTraceSetupDelayMs = 300; + // Timeout for record-trace to emit "Recording started" on stdout. record-trace's + // startup has two phases: a /proc scan for existing processes, then enabling ring + // buffers to capture live mmap events. If the tracee starts during the /proc scan, + // record-trace may discover it and send IPC before ring buffers are active, causing + // emitted events to be lost. If the tracee starts after the /proc scan but before + // ring buffers are enabled, its mmap events are missed entirely and record-trace + // never discovers it. By gating on "Recording started" (printed after enable), the + // tracee is only discovered via live mmap events with ring buffers already active. + // record-trace startup -> enable scales with system process count: averaged 113ms at + // 126 processes and 253ms at 534 processes on a 2-core x64 system, but took ~1845ms + // on a 2-core ARM64 CI machine. + private const int RecordTraceSetupTimeoutMs = 10000; [DllImport("libc", EntryPoint = "kill", SetLastError = true)] private static extern int Kill(int pid, int sig); @@ -147,11 +149,16 @@ private static int RunOrchestrator( Console.WriteLine($"Starting record-trace: {recordTraceStartInfo.FileName} {string.Join(" ", recordTraceStartInfo.ArgumentList)}"); using Process recordTraceProcess = Process.Start(recordTraceStartInfo); + using var recordingStarted = new ManualResetEventSlim(false); recordTraceProcess.OutputDataReceived += (_, args) => { if (!string.IsNullOrEmpty(args.Data)) { Console.WriteLine($"[record-trace][stdout] {args.Data}"); + if (args.Data.Contains("Recording started", StringComparison.Ordinal)) + { + recordingStarted.Set(); + } } }; recordTraceProcess.BeginOutputReadLine(); @@ -197,9 +204,25 @@ private static int RunOrchestrator( // When https://github.com/microsoft/one-collect/issues/183 is fixed, this and the above TMPDIR should be removed. EnsureCleanDiagnosticPorts(diagnosticPortDir); - // Allow record-trace to finish setup before starting the tracee. - Console.WriteLine($"Delaying tracee startup {RecordTraceSetupDelayMs}ms for record-trace setup..."); - Thread.Sleep(RecordTraceSetupDelayMs); + // Wait for record-trace to finish setup (capture_environment + enable ring buffers). + // "Recording started" is printed after session.enable() succeeds, which means + // the ring buffers are active and will capture the tracee's mmap events. + Console.WriteLine("Waiting for record-trace to signal 'Recording started'..."); + if (!recordingStarted.Wait(RecordTraceSetupTimeoutMs)) + { + if (recordTraceProcess.HasExited) + { + Console.Error.WriteLine($"record-trace exited prematurely with code {recordTraceProcess.ExitCode}."); + } + else + { + Console.Error.WriteLine($"record-trace did not emit 'Recording started' within {RecordTraceSetupTimeoutMs}ms."); + recordTraceProcess.Kill(); + } + + UploadArtifactsFromHelixOnFailure(scenarioName, recordTraceLogPath); + return -1; + } Console.WriteLine($"Starting tracee process: {traceeStartInfo.FileName} {string.Join(" ", traceeStartInfo.ArgumentList)}"); using Process traceeProcess = Process.Start(traceeStartInfo); From b7b0333931ed7c62f37e9410f3a828e98a1cb491 Mon Sep 17 00:00:00 2001 From: mdh1418 Date: Mon, 9 Mar 2026 19:24:42 +0000 Subject: [PATCH 3/5] [Tests][UserEvents] Improve record-trace log diagnostics Add --log-filter to record-trace to capture debug-level diagnostics for dotnet process discovery (mmap events, IPC enablement), perf session lifecycle (capture_environment, enable/disable, parse_all), and nettrace event block flushing. Suppress noisy modules (ELF metadata loading, stack unwinding, tracefs) to keep logs focused. On validation failure, dump a summary of all events from the tracee PID found in the nettrace (DumpTraceeEvents) to distinguish between 'event not captured' and 'event captured with unexpected metadata'. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../userevents/common/UserEventsTestRunner.cs | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/tests/tracing/userevents/common/UserEventsTestRunner.cs b/src/tests/tracing/userevents/common/UserEventsTestRunner.cs index a84ad48f1d68e8..e0c0eeab9abe3e 100644 --- a/src/tests/tracing/userevents/common/UserEventsTestRunner.cs +++ b/src/tests/tracing/userevents/common/UserEventsTestRunner.cs @@ -142,6 +142,17 @@ private static int RunOrchestrator( recordTraceStartInfo.ArgumentList.Add(traceFilePath); recordTraceStartInfo.ArgumentList.Add("--log-path"); recordTraceStartInfo.ArgumentList.Add(recordTraceLogPath); + recordTraceStartInfo.ArgumentList.Add("--log-filter"); + recordTraceStartInfo.ArgumentList.Add( + "one_collect::helpers::dotnet=debug," + + "one_collect::perf_event=debug," + + "one_collect::perf_event::rb=info," + + "one_collect::helpers::exporting::formats::nettrace=debug," + + "one_collect::helpers::exporting::os=warn," + + "ruwind=warn," + + "one_collect::tracefs=warn," + + "one_collect::scripting=warn," + + "engine=warn"); recordTraceStartInfo.WorkingDirectory = userEventsScenarioDir; recordTraceStartInfo.UseShellExecute = false; recordTraceStartInfo.RedirectStandardOutput = true; @@ -284,6 +295,7 @@ private static int RunOrchestrator( if (!traceValidator(traceePid, source)) { Console.Error.WriteLine($"Trace file `{traceFilePath}` does not contain expected events."); + DumpTraceeEvents(traceFilePath, traceePid); UploadArtifactsFromHelixOnFailure(scenarioName, traceFilePath, recordTraceLogPath); return -1; } @@ -399,5 +411,37 @@ private static void UploadArtifactsFromHelixOnFailure(string scenarioName, param File.Copy(filePath, destPath, overwrite: true); } } + + private static void DumpTraceeEvents(string traceFilePath, int traceePid) + { + try + { + using EventPipeEventSource diagSource = new EventPipeEventSource(traceFilePath); + int traceeEventCount = 0; + var eventSummary = new Dictionary(); + + diagSource.Dynamic.All += (TraceEvent e) => + { + if (e.ProcessID != traceePid) + return; + + traceeEventCount++; + string key = $"{e.ProviderName}/{e.EventName}"; + eventSummary[key] = eventSummary.GetValueOrDefault(key) + 1; + }; + + diagSource.Process(); + + Console.Error.WriteLine($"Tracee PID {traceePid} had {traceeEventCount} event(s) in the trace:"); + foreach (var (key, count) in eventSummary) + { + Console.Error.WriteLine($" {key}: {count}"); + } + } + catch (Exception ex) + { + Console.Error.WriteLine($"Failed to dump tracee events: {ex}"); + } + } } } \ No newline at end of file From ca8dbe12e23d622545b864561bfe15f333871b3f Mon Sep 17 00:00:00 2001 From: mdh1418 Date: Tue, 10 Mar 2026 02:39:48 +0000 Subject: [PATCH 4/5] Disable tests for stress configurations --- src/tests/tracing/userevents/Directory.Build.props | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/tests/tracing/userevents/Directory.Build.props b/src/tests/tracing/userevents/Directory.Build.props index 67961d29740a70..7a312460939e55 100644 --- a/src/tests/tracing/userevents/Directory.Build.props +++ b/src/tests/tracing/userevents/Directory.Build.props @@ -4,5 +4,8 @@ true + true + true + true From d3e5928254ca96117a09237e389573e95f98fdb2 Mon Sep 17 00:00:00 2001 From: mdh1418 Date: Tue, 10 Mar 2026 02:48:01 +0000 Subject: [PATCH 5/5] Revert tracee timeout bump as jitstress is disabled --- src/tests/tracing/userevents/common/UserEventsTestRunner.cs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/tests/tracing/userevents/common/UserEventsTestRunner.cs b/src/tests/tracing/userevents/common/UserEventsTestRunner.cs index e0c0eeab9abe3e..b39e96f57ad8c8 100644 --- a/src/tests/tracing/userevents/common/UserEventsTestRunner.cs +++ b/src/tests/tracing/userevents/common/UserEventsTestRunner.cs @@ -17,10 +17,7 @@ namespace Tracing.UserEvents.Tests.Common public class UserEventsTestRunner { private const int SIGINT = 2; - - // The tracee must JIT compile, receive IPC enable, and emit events before exiting. - // Under JitStress on ARM64, this can take significantly longer than normal. - private const int DefaultTraceeExitTimeoutMs = 60000; + private const int DefaultTraceeExitTimeoutMs = 5000; private const int DefaultRecordTraceExitTimeoutMs = 20000; // Timeout for record-trace to emit "Recording started" on stdout. record-trace's