Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/tests/tracing/userevents/Directory.Build.props
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,8 @@

<PropertyGroup>
<CLRTestTargetUnsupported Condition="'$(RuntimeFlavor)' == 'Mono'">true</CLRTestTargetUnsupported>
<GCStressIncompatible>true</GCStressIncompatible>
<JitOptimizationSensitive>true</JitOptimizationSensitive>
<UnloadabilityIncompatible>true</UnloadabilityIncompatible>
</PropertyGroup>
</Project>
97 changes: 83 additions & 14 deletions src/tests/tracing/userevents/common/UserEventsTestRunner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,21 @@ namespace Tracing.UserEvents.Tests.Common
public class UserEventsTestRunner
{
private const int SIGINT = 2;
private const int DefaultTraceeExitTimeoutMs = 60000;
private const int DefaultTraceeExitTimeoutMs = 5000;
Comment thread
mdh1418 marked this conversation as resolved.
private const int DefaultRecordTraceExitTimeoutMs = 20000;

// Delay before starting the tracee to let record-trace finish setup. The
// tracee's ManualResetEventSlim gates on tracepoint registration via IPC, but
// that alone is not sufficient as record-trace's ring buffers must also be active
// (PerfSession::enable) to capture events. Without this delay, the tracee may be
// discovered during record-trace's /proc scan and receive IPC before enable,
// causing events to be lost. With the delay, the tracee is instead discovered
// via MMAP2 records after PerfSession is enabled.
// record-trace startup -> PerfSession::enable scales with system process count:
// averaged 113ms at 126 processes and 253ms at 534 processes on a 2-core system.
private const int RecordTraceSetupDelayMs = 300;
// Timeout for record-trace to emit "Recording started" on stdout. record-trace's
// startup has two phases: a /proc scan for existing processes, then enabling ring
// buffers to capture live mmap events. If the tracee starts during the /proc scan,
// record-trace may discover it and send IPC before ring buffers are active, causing
// emitted events to be lost. If the tracee starts after the /proc scan but before
// ring buffers are enabled, its mmap events are missed entirely and record-trace
// never discovers it. By gating on "Recording started" (printed after enable), the
// tracee is only discovered via live mmap events with ring buffers already active.
// record-trace startup -> enable scales with system process count: averaged 113ms at
// 126 processes and 253ms at 534 processes on a 2-core x64 system, but took ~1845ms
// on a 2-core ARM64 CI machine.
private const int RecordTraceSetupTimeoutMs = 10000;

[DllImport("libc", EntryPoint = "kill", SetLastError = true)]
private static extern int Kill(int pid, int sig);
Expand Down Expand Up @@ -65,6 +67,7 @@ public static int Run(
Console.WriteLine("Tracee EventSource enabled, emitting events.");

traceeAction();
Console.WriteLine("Tracee finished emitting events.");
return 0;
}

Expand Down Expand Up @@ -136,18 +139,34 @@ private static int RunOrchestrator(
recordTraceStartInfo.ArgumentList.Add(traceFilePath);
recordTraceStartInfo.ArgumentList.Add("--log-path");
recordTraceStartInfo.ArgumentList.Add(recordTraceLogPath);
recordTraceStartInfo.ArgumentList.Add("--log-filter");
recordTraceStartInfo.ArgumentList.Add(
"one_collect::helpers::dotnet=debug," +
"one_collect::perf_event=debug," +
"one_collect::perf_event::rb=info," +
"one_collect::helpers::exporting::formats::nettrace=debug," +
"one_collect::helpers::exporting::os=warn," +
"ruwind=warn," +
"one_collect::tracefs=warn," +
"one_collect::scripting=warn," +
"engine=warn");
recordTraceStartInfo.WorkingDirectory = userEventsScenarioDir;
recordTraceStartInfo.UseShellExecute = false;
recordTraceStartInfo.RedirectStandardOutput = true;
recordTraceStartInfo.RedirectStandardError = true;

Console.WriteLine($"Starting record-trace: {recordTraceStartInfo.FileName} {string.Join(" ", recordTraceStartInfo.ArgumentList)}");
using Process recordTraceProcess = Process.Start(recordTraceStartInfo);
using var recordingStarted = new ManualResetEventSlim(false);
recordTraceProcess.OutputDataReceived += (_, args) =>
{
if (!string.IsNullOrEmpty(args.Data))
{
Console.WriteLine($"[record-trace][stdout] {args.Data}");
if (args.Data.Contains("Recording started", StringComparison.Ordinal))
{
recordingStarted.Set();
}
}
};
recordTraceProcess.BeginOutputReadLine();
Expand Down Expand Up @@ -193,9 +212,25 @@ private static int RunOrchestrator(
// When https://github.com/microsoft/one-collect/issues/183 is fixed, this and the above TMPDIR should be removed.
EnsureCleanDiagnosticPorts(diagnosticPortDir);

// Allow record-trace to finish setup before starting the tracee.
Console.WriteLine($"Delaying tracee startup {RecordTraceSetupDelayMs}ms for record-trace setup...");
Thread.Sleep(RecordTraceSetupDelayMs);
// Wait for record-trace to finish setup (capture_environment + enable ring buffers).
// "Recording started" is printed after session.enable() succeeds, which means
// the ring buffers are active and will capture the tracee's mmap events.
Console.WriteLine("Waiting for record-trace to signal 'Recording started'...");
if (!recordingStarted.Wait(RecordTraceSetupTimeoutMs))
{
if (recordTraceProcess.HasExited)
{
Console.Error.WriteLine($"record-trace exited prematurely with code {recordTraceProcess.ExitCode}.");
}
else
{
Console.Error.WriteLine($"record-trace did not emit 'Recording started' within {RecordTraceSetupTimeoutMs}ms.");
recordTraceProcess.Kill();
}

UploadArtifactsFromHelixOnFailure(scenarioName, recordTraceLogPath);
return -1;
}

Console.WriteLine($"Starting tracee process: {traceeStartInfo.FileName} {string.Join(" ", traceeStartInfo.ArgumentList)}");
using Process traceeProcess = Process.Start(traceeStartInfo);
Expand Down Expand Up @@ -225,6 +260,7 @@ private static int RunOrchestrator(
traceeProcess.Kill();
}
traceeProcess.WaitForExit(); // flush async output
Console.WriteLine($"Tracee process exited with code {traceeProcess.ExitCode}.");

if (!recordTraceProcess.HasExited)
{
Expand Down Expand Up @@ -256,6 +292,7 @@ private static int RunOrchestrator(
if (!traceValidator(traceePid, source))
{
Console.Error.WriteLine($"Trace file `{traceFilePath}` does not contain expected events.");
DumpTraceeEvents(traceFilePath, traceePid);
UploadArtifactsFromHelixOnFailure(scenarioName, traceFilePath, recordTraceLogPath);
return -1;
}
Expand Down Expand Up @@ -371,5 +408,37 @@ private static void UploadArtifactsFromHelixOnFailure(string scenarioName, param
File.Copy(filePath, destPath, overwrite: true);
}
}

private static void DumpTraceeEvents(string traceFilePath, int traceePid)
{
try
{
using EventPipeEventSource diagSource = new EventPipeEventSource(traceFilePath);
int traceeEventCount = 0;
var eventSummary = new Dictionary<string, int>();

diagSource.Dynamic.All += (TraceEvent e) =>
{
if (e.ProcessID != traceePid)
return;

traceeEventCount++;
string key = $"{e.ProviderName}/{e.EventName}";
eventSummary[key] = eventSummary.GetValueOrDefault(key) + 1;
};

diagSource.Process();

Console.Error.WriteLine($"Tracee PID {traceePid} had {traceeEventCount} event(s) in the trace:");
foreach (var (key, count) in eventSummary)
{
Console.Error.WriteLine($" {key}: {count}");
}
}
catch (Exception ex)
{
Console.Error.WriteLine($"Failed to dump tracee events: {ex}");
}
}
}
}