diff --git a/src/tests/tracing/userevents/Directory.Build.props b/src/tests/tracing/userevents/Directory.Build.props
index 67961d29740a70..7a312460939e55 100644
--- a/src/tests/tracing/userevents/Directory.Build.props
+++ b/src/tests/tracing/userevents/Directory.Build.props
@@ -4,5 +4,8 @@
true
+ true
+ true
+ true
diff --git a/src/tests/tracing/userevents/common/UserEventsTestRunner.cs b/src/tests/tracing/userevents/common/UserEventsTestRunner.cs
index 65d2f02ab26dfb..b39e96f57ad8c8 100644
--- a/src/tests/tracing/userevents/common/UserEventsTestRunner.cs
+++ b/src/tests/tracing/userevents/common/UserEventsTestRunner.cs
@@ -17,19 +17,21 @@ namespace Tracing.UserEvents.Tests.Common
public class UserEventsTestRunner
{
private const int SIGINT = 2;
- private const int DefaultTraceeExitTimeoutMs = 60000;
+ private const int DefaultTraceeExitTimeoutMs = 5000;
private const int DefaultRecordTraceExitTimeoutMs = 20000;
- // Delay before starting the tracee to let record-trace finish setup. The
- // tracee's ManualResetEventSlim gates on tracepoint registration via IPC, but
- // that alone is not sufficient as record-trace's ring buffers must also be active
- // (PerfSession::enable) to capture events. Without this delay, the tracee may be
- // discovered during record-trace's /proc scan and receive IPC before enable,
- // causing events to be lost. With the delay, the tracee is instead discovered
- // via MMAP2 records after PerfSession is enabled.
- // record-trace startup -> PerfSession::enable scales with system process count:
- // averaged 113ms at 126 processes and 253ms at 534 processes on a 2-core system.
- private const int RecordTraceSetupDelayMs = 300;
+ // Timeout for record-trace to emit "Recording started" on stdout. record-trace's
+ // startup has two phases: a /proc scan for existing processes, then enabling ring
+ // buffers to capture live mmap events. If the tracee starts during the /proc scan,
+ // record-trace may discover it and send IPC before ring buffers are active, causing
+ // emitted events to be lost. If the tracee starts after the /proc scan but before
+ // ring buffers are enabled, its mmap events are missed entirely and record-trace
+ // never discovers it. By gating on "Recording started" (printed after enable), the
+ // tracee is only discovered via live mmap events with ring buffers already active.
+ // record-trace startup -> enable scales with system process count: averaged 113ms at
+ // 126 processes and 253ms at 534 processes on a 2-core x64 system, but took ~1845ms
+ // on a 2-core ARM64 CI machine.
+ private const int RecordTraceSetupTimeoutMs = 10000;
[DllImport("libc", EntryPoint = "kill", SetLastError = true)]
private static extern int Kill(int pid, int sig);
@@ -65,6 +67,7 @@ public static int Run(
Console.WriteLine("Tracee EventSource enabled, emitting events.");
traceeAction();
+ Console.WriteLine("Tracee finished emitting events.");
return 0;
}
@@ -136,6 +139,17 @@ private static int RunOrchestrator(
recordTraceStartInfo.ArgumentList.Add(traceFilePath);
recordTraceStartInfo.ArgumentList.Add("--log-path");
recordTraceStartInfo.ArgumentList.Add(recordTraceLogPath);
+ recordTraceStartInfo.ArgumentList.Add("--log-filter");
+ recordTraceStartInfo.ArgumentList.Add(
+ "one_collect::helpers::dotnet=debug," +
+ "one_collect::perf_event=debug," +
+ "one_collect::perf_event::rb=info," +
+ "one_collect::helpers::exporting::formats::nettrace=debug," +
+ "one_collect::helpers::exporting::os=warn," +
+ "ruwind=warn," +
+ "one_collect::tracefs=warn," +
+ "one_collect::scripting=warn," +
+ "engine=warn");
recordTraceStartInfo.WorkingDirectory = userEventsScenarioDir;
recordTraceStartInfo.UseShellExecute = false;
recordTraceStartInfo.RedirectStandardOutput = true;
@@ -143,11 +157,16 @@ private static int RunOrchestrator(
Console.WriteLine($"Starting record-trace: {recordTraceStartInfo.FileName} {string.Join(" ", recordTraceStartInfo.ArgumentList)}");
using Process recordTraceProcess = Process.Start(recordTraceStartInfo);
+ using var recordingStarted = new ManualResetEventSlim(false);
recordTraceProcess.OutputDataReceived += (_, args) =>
{
if (!string.IsNullOrEmpty(args.Data))
{
Console.WriteLine($"[record-trace][stdout] {args.Data}");
+ if (args.Data.Contains("Recording started", StringComparison.Ordinal))
+ {
+ recordingStarted.Set();
+ }
}
};
recordTraceProcess.BeginOutputReadLine();
@@ -193,9 +212,25 @@ private static int RunOrchestrator(
// When https://github.com/microsoft/one-collect/issues/183 is fixed, this and the above TMPDIR should be removed.
EnsureCleanDiagnosticPorts(diagnosticPortDir);
- // Allow record-trace to finish setup before starting the tracee.
- Console.WriteLine($"Delaying tracee startup {RecordTraceSetupDelayMs}ms for record-trace setup...");
- Thread.Sleep(RecordTraceSetupDelayMs);
+ // Wait for record-trace to finish setup (capture_environment + enable ring buffers).
+ // "Recording started" is printed after session.enable() succeeds, which means
+ // the ring buffers are active and will capture the tracee's mmap events.
+ Console.WriteLine("Waiting for record-trace to signal 'Recording started'...");
+ if (!recordingStarted.Wait(RecordTraceSetupTimeoutMs))
+ {
+ if (recordTraceProcess.HasExited)
+ {
+ Console.Error.WriteLine($"record-trace exited prematurely with code {recordTraceProcess.ExitCode}.");
+ }
+ else
+ {
+ Console.Error.WriteLine($"record-trace did not emit 'Recording started' within {RecordTraceSetupTimeoutMs}ms.");
+ recordTraceProcess.Kill();
+ }
+
+ UploadArtifactsFromHelixOnFailure(scenarioName, recordTraceLogPath);
+ return -1;
+ }
Console.WriteLine($"Starting tracee process: {traceeStartInfo.FileName} {string.Join(" ", traceeStartInfo.ArgumentList)}");
using Process traceeProcess = Process.Start(traceeStartInfo);
@@ -225,6 +260,7 @@ private static int RunOrchestrator(
traceeProcess.Kill();
}
traceeProcess.WaitForExit(); // flush async output
+ Console.WriteLine($"Tracee process exited with code {traceeProcess.ExitCode}.");
if (!recordTraceProcess.HasExited)
{
@@ -256,6 +292,7 @@ private static int RunOrchestrator(
if (!traceValidator(traceePid, source))
{
Console.Error.WriteLine($"Trace file `{traceFilePath}` does not contain expected events.");
+ DumpTraceeEvents(traceFilePath, traceePid);
UploadArtifactsFromHelixOnFailure(scenarioName, traceFilePath, recordTraceLogPath);
return -1;
}
@@ -371,5 +408,37 @@ private static void UploadArtifactsFromHelixOnFailure(string scenarioName, param
File.Copy(filePath, destPath, overwrite: true);
}
}
+
+ private static void DumpTraceeEvents(string traceFilePath, int traceePid)
+ {
+ try
+ {
+ using EventPipeEventSource diagSource = new EventPipeEventSource(traceFilePath);
+ int traceeEventCount = 0;
+ var eventSummary = new Dictionary();
+
+ diagSource.Dynamic.All += (TraceEvent e) =>
+ {
+ if (e.ProcessID != traceePid)
+ return;
+
+ traceeEventCount++;
+ string key = $"{e.ProviderName}/{e.EventName}";
+ eventSummary[key] = eventSummary.GetValueOrDefault(key) + 1;
+ };
+
+ diagSource.Process();
+
+ Console.Error.WriteLine($"Tracee PID {traceePid} had {traceeEventCount} event(s) in the trace:");
+ foreach (var (key, count) in eventSummary)
+ {
+ Console.Error.WriteLine($" {key}: {count}");
+ }
+ }
+ catch (Exception ex)
+ {
+ Console.Error.WriteLine($"Failed to dump tracee events: {ex}");
+ }
+ }
}
}
\ No newline at end of file