diff --git a/src/tests/tracing/userevents/Directory.Build.props b/src/tests/tracing/userevents/Directory.Build.props index 67961d29740a70..7a312460939e55 100644 --- a/src/tests/tracing/userevents/Directory.Build.props +++ b/src/tests/tracing/userevents/Directory.Build.props @@ -4,5 +4,8 @@ true + true + true + true diff --git a/src/tests/tracing/userevents/common/UserEventsTestRunner.cs b/src/tests/tracing/userevents/common/UserEventsTestRunner.cs index 65d2f02ab26dfb..b39e96f57ad8c8 100644 --- a/src/tests/tracing/userevents/common/UserEventsTestRunner.cs +++ b/src/tests/tracing/userevents/common/UserEventsTestRunner.cs @@ -17,19 +17,21 @@ namespace Tracing.UserEvents.Tests.Common public class UserEventsTestRunner { private const int SIGINT = 2; - private const int DefaultTraceeExitTimeoutMs = 60000; + private const int DefaultTraceeExitTimeoutMs = 5000; private const int DefaultRecordTraceExitTimeoutMs = 20000; - // Delay before starting the tracee to let record-trace finish setup. The - // tracee's ManualResetEventSlim gates on tracepoint registration via IPC, but - // that alone is not sufficient as record-trace's ring buffers must also be active - // (PerfSession::enable) to capture events. Without this delay, the tracee may be - // discovered during record-trace's /proc scan and receive IPC before enable, - // causing events to be lost. With the delay, the tracee is instead discovered - // via MMAP2 records after PerfSession is enabled. - // record-trace startup -> PerfSession::enable scales with system process count: - // averaged 113ms at 126 processes and 253ms at 534 processes on a 2-core system. - private const int RecordTraceSetupDelayMs = 300; + // Timeout for record-trace to emit "Recording started" on stdout. record-trace's + // startup has two phases: a /proc scan for existing processes, then enabling ring + // buffers to capture live mmap events. If the tracee starts during the /proc scan, + // record-trace may discover it and send IPC before ring buffers are active, causing + // emitted events to be lost. If the tracee starts after the /proc scan but before + // ring buffers are enabled, its mmap events are missed entirely and record-trace + // never discovers it. By gating on "Recording started" (printed after enable), the + // tracee is only discovered via live mmap events with ring buffers already active. + // record-trace startup -> enable scales with system process count: averaged 113ms at + // 126 processes and 253ms at 534 processes on a 2-core x64 system, but took ~1845ms + // on a 2-core ARM64 CI machine. + private const int RecordTraceSetupTimeoutMs = 10000; [DllImport("libc", EntryPoint = "kill", SetLastError = true)] private static extern int Kill(int pid, int sig); @@ -65,6 +67,7 @@ public static int Run( Console.WriteLine("Tracee EventSource enabled, emitting events."); traceeAction(); + Console.WriteLine("Tracee finished emitting events."); return 0; } @@ -136,6 +139,17 @@ private static int RunOrchestrator( recordTraceStartInfo.ArgumentList.Add(traceFilePath); recordTraceStartInfo.ArgumentList.Add("--log-path"); recordTraceStartInfo.ArgumentList.Add(recordTraceLogPath); + recordTraceStartInfo.ArgumentList.Add("--log-filter"); + recordTraceStartInfo.ArgumentList.Add( + "one_collect::helpers::dotnet=debug," + + "one_collect::perf_event=debug," + + "one_collect::perf_event::rb=info," + + "one_collect::helpers::exporting::formats::nettrace=debug," + + "one_collect::helpers::exporting::os=warn," + + "ruwind=warn," + + "one_collect::tracefs=warn," + + "one_collect::scripting=warn," + + "engine=warn"); recordTraceStartInfo.WorkingDirectory = userEventsScenarioDir; recordTraceStartInfo.UseShellExecute = false; recordTraceStartInfo.RedirectStandardOutput = true; @@ -143,11 +157,16 @@ private static int RunOrchestrator( Console.WriteLine($"Starting record-trace: {recordTraceStartInfo.FileName} {string.Join(" ", recordTraceStartInfo.ArgumentList)}"); using Process recordTraceProcess = Process.Start(recordTraceStartInfo); + using var recordingStarted = new ManualResetEventSlim(false); recordTraceProcess.OutputDataReceived += (_, args) => { if (!string.IsNullOrEmpty(args.Data)) { Console.WriteLine($"[record-trace][stdout] {args.Data}"); + if (args.Data.Contains("Recording started", StringComparison.Ordinal)) + { + recordingStarted.Set(); + } } }; recordTraceProcess.BeginOutputReadLine(); @@ -193,9 +212,25 @@ private static int RunOrchestrator( // When https://github.com/microsoft/one-collect/issues/183 is fixed, this and the above TMPDIR should be removed. EnsureCleanDiagnosticPorts(diagnosticPortDir); - // Allow record-trace to finish setup before starting the tracee. - Console.WriteLine($"Delaying tracee startup {RecordTraceSetupDelayMs}ms for record-trace setup..."); - Thread.Sleep(RecordTraceSetupDelayMs); + // Wait for record-trace to finish setup (capture_environment + enable ring buffers). + // "Recording started" is printed after session.enable() succeeds, which means + // the ring buffers are active and will capture the tracee's mmap events. + Console.WriteLine("Waiting for record-trace to signal 'Recording started'..."); + if (!recordingStarted.Wait(RecordTraceSetupTimeoutMs)) + { + if (recordTraceProcess.HasExited) + { + Console.Error.WriteLine($"record-trace exited prematurely with code {recordTraceProcess.ExitCode}."); + } + else + { + Console.Error.WriteLine($"record-trace did not emit 'Recording started' within {RecordTraceSetupTimeoutMs}ms."); + recordTraceProcess.Kill(); + } + + UploadArtifactsFromHelixOnFailure(scenarioName, recordTraceLogPath); + return -1; + } Console.WriteLine($"Starting tracee process: {traceeStartInfo.FileName} {string.Join(" ", traceeStartInfo.ArgumentList)}"); using Process traceeProcess = Process.Start(traceeStartInfo); @@ -225,6 +260,7 @@ private static int RunOrchestrator( traceeProcess.Kill(); } traceeProcess.WaitForExit(); // flush async output + Console.WriteLine($"Tracee process exited with code {traceeProcess.ExitCode}."); if (!recordTraceProcess.HasExited) { @@ -256,6 +292,7 @@ private static int RunOrchestrator( if (!traceValidator(traceePid, source)) { Console.Error.WriteLine($"Trace file `{traceFilePath}` does not contain expected events."); + DumpTraceeEvents(traceFilePath, traceePid); UploadArtifactsFromHelixOnFailure(scenarioName, traceFilePath, recordTraceLogPath); return -1; } @@ -371,5 +408,37 @@ private static void UploadArtifactsFromHelixOnFailure(string scenarioName, param File.Copy(filePath, destPath, overwrite: true); } } + + private static void DumpTraceeEvents(string traceFilePath, int traceePid) + { + try + { + using EventPipeEventSource diagSource = new EventPipeEventSource(traceFilePath); + int traceeEventCount = 0; + var eventSummary = new Dictionary(); + + diagSource.Dynamic.All += (TraceEvent e) => + { + if (e.ProcessID != traceePid) + return; + + traceeEventCount++; + string key = $"{e.ProviderName}/{e.EventName}"; + eventSummary[key] = eventSummary.GetValueOrDefault(key) + 1; + }; + + diagSource.Process(); + + Console.Error.WriteLine($"Tracee PID {traceePid} had {traceeEventCount} event(s) in the trace:"); + foreach (var (key, count) in eventSummary) + { + Console.Error.WriteLine($" {key}: {count}"); + } + } + catch (Exception ex) + { + Console.Error.WriteLine($"Failed to dump tracee events: {ex}"); + } + } } } \ No newline at end of file