dpsoft · dpsoft · Jan 26, 2026 · Jan 26, 2026 · Jan 26, 2026 · Jan 26, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -121,4 +121,5 @@ jobs:
     - name: Run integration tests
       run: |
         cd test
-        sudo -E go test -v -timeout 5m ./...
+        sudo env LD_LIBRARY_PATH="/usr/local/lib:$LD_LIBRARY_PATH" \
+          go test -v -timeout 5m ./...
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -140,7 +140,8 @@ jobs:
       - name: Run integration tests
         run: |
           cd test
-          sudo -E $(which go) test -v -timeout 15m ./...
+          sudo env LD_LIBRARY_PATH="/usr/local/lib:$LD_LIBRARY_PATH" \
+            $(which go) test -v -timeout 15m ./...
 
   lint:
     name: Lint

diff --git a/Makefile b/Makefile
@@ -31,7 +31,10 @@ test-unit: generate
 
 .PHONY: test-integration
 test-integration: build test-workloads
-	cd test && CGO_CFLAGS="-I$(LIBBLAZESYM_INC)" CGO_LDFLAGS="-L$(abspath $(LIBBLAZESYM_SRC)/target/release)" bash run_tests.sh
+	cd test && LD_LIBRARY_PATH="$(abspath $(LIBBLAZESYM_SRC)/target/release):$$LD_LIBRARY_PATH" \
+		CGO_CFLAGS="-I /usr/include/bpf -I /usr/include/pcap -I$(LIBBLAZESYM_INC)" \
+		CGO_LDFLAGS="-L$(abspath $(LIBBLAZESYM_SRC)/target/release) -Wl,-Bstatic -lblazesym_c -Wl,-Bdynamic" \
+		bash run_tests.sh
 
 .PHONY: test
 test: test-unit test-integration

diff --git a/README.md b/README.md
@@ -141,11 +141,95 @@ go tool pprof offcpu.pb.gz
 ### PMU Mode (`--pmu`)
 
 Prints to stdout:
-- **On-CPU time**: Time slice per context switch (min, max, mean, percentiles)
-- **Runqueue latency**: Time waiting for CPU after becoming runnable (min, max, mean, percentiles)
-- **Context switch reasons**: Breakdown of preempted (running), voluntary (sleep/mutex), and I/O wait (D state)
-- **Hardware counters**: Cycles, instructions, cache misses
-- **Derived metrics**: IPC (instructions per cycle), cache miss rate
+
+- **On-CPU Time**: Time slice per context switch (min, max, mean, percentiles)
+  - Measures how long a process runs on CPU before being switched out
+- **Runqueue Latency**: Time waiting for CPU after becoming runnable (min, max, mean, percentiles)
+  - Measures scheduling delay: time from `sched_wakeup` to actually running
+- **Context Switch Reasons**: Breakdown of why tasks were switched out
+  - **Preempted (running)**: Task was running and got preempted by scheduler
+  - **Voluntary (sleep/mutex)**: Task voluntarily yielded (sleep, mutex wait)
+  - **I/O Wait (D state)**: Task blocked on I/O (uninterruptible sleep)
+- **Hardware Counters**: Cycles, instructions, cache misses
+- **Derived Metrics**: IPC (instructions per cycle), cache miss rate
+
+Example output:
+```
+=== PMU Metrics (PID: 84228) ===
+Samples: 26358
+
+On-CPU Time (time slice per context switch):
+  Min:    0.003 ms
+  P50:    0.071 ms
+  P99:    9.183 ms
+
+Runqueue Latency (time waiting for CPU):
+  Min:    0.001 ms
+  P50:    0.012 ms
+  P99:    0.850 ms
+
+Context Switch Reasons:
+  Preempted (running):     45.2%  (11912 times)
+  Voluntary (sleep/mutex): 42.1%  (11095 times)
+  I/O Wait (D state):      12.7%  (3351 times)
+
+Hardware Counters:
+  IPC (Instr/Cycle):  2.342
+  Cache Misses/1K:    0.022
+```
+
+## Library Usage
+
+`perf-agent` can be used as a Go library via the `perfagent` package:
+
+```go
+package main
+
+import (
+    "context"
+    "log"
+    "time"
+    "perf-agent/perfagent"
+)
+
+func main() {
+    agent, err := perfagent.New(
+        perfagent.WithPID(12345),
+        perfagent.WithCPUProfile("profile.pb.gz"),
+        perfagent.WithPMU(),
+    )
+    if err != nil {
+        log.Fatal(err)
+    }
+    defer agent.Close()
+
+    ctx := context.Background()
+    agent.Start(ctx)
+    time.Sleep(10 * time.Second)
+    agent.Stop(ctx)
+}
+```
+
+### In-Memory Collection
+
+```go
+var buf bytes.Buffer
+agent, _ := perfagent.New(
+    perfagent.WithCPUProfileWriter(&buf), // gzip-compressed pprof
+)
+// After Stop(), buf contains ready-to-use .pb.gz data
+```
+
+### Custom Metrics Export
+
+```go
+agent, _ := perfagent.New(
+    perfagent.WithPMU(),
+    perfagent.WithMetricsExporter(&MyExporter{}),
+)
+```
+
+See [perfagent package documentation](perfagent/) for all available options.
 
 ## Building
 

diff --git a/cpu/cpu_usage_collector.go b/cpu/cpu_usage_collector.go
@@ -1,6 +1,7 @@
 package cpu
 
 import (
+	"context"
 	"errors"
 	"fmt"
 	"log"
@@ -9,6 +10,8 @@ import (
 
 	"github.com/HdrHistogram/hdrhistogram-go"
 	"github.com/cilium/ebpf/ringbuf"
+
+	"perf-agent/metrics"
 )
 
 type CPUUsageCollector struct {
@@ -263,7 +266,7 @@ func printSinglePIDMetrics(m *PidMetrics) {
 }
 
 // printAggregateMetrics prints aggregated metrics for system-wide mode
-func printAggregateMetrics(metrics map[uint32]*PidMetrics) {
+func printAggregateMetrics(metricsMap map[uint32]*PidMetrics) {
 	var totalSamples uint64
 	var totalCycles, totalInstructions, totalCacheMisses uint64
 	var totalPreempted, totalVoluntary, totalIOWait uint64
@@ -272,7 +275,7 @@ func printAggregateMetrics(metrics map[uint32]*PidMetrics) {
 	aggOnCPUHist := hdrhistogram.New(0, 1000000000000000, 3)
 	aggRunqHist := hdrhistogram.New(0, 1000000000000, 3)
 
-	for _, m := range metrics {
+	for _, m := range metricsMap {
 		totalSamples += m.SampleCount
 		totalCycles += m.TotalCycles
 		totalInstructions += m.TotalInstructions
@@ -287,7 +290,7 @@ func printAggregateMetrics(metrics map[uint32]*PidMetrics) {
 	}
 
 	fmt.Printf("\nPerformance counter stats for 'system wide':\n\n")
-	fmt.Printf("  Processes profiled:     %d\n", len(metrics))
+	fmt.Printf("  Processes profiled:     %d\n", len(metricsMap))
 	fmt.Printf("  Total samples:          %d\n", totalSamples)
 
 	// On-CPU time histogram stats
@@ -340,3 +343,74 @@ func printAggregateMetrics(metrics map[uint32]*PidMetrics) {
 		fmt.Printf("  Cache Misses/1K Instr:  %.2f\n", missRate)
 	}
 }
+
+// GetSnapshot converts the collector's per-PID metrics into a
+// metrics.MetricsSnapshot for export; systemWide is forwarded unchanged to
+// metrics.NewMetricsSnapshot. Histogram values are copied without conversion.
+func (c *CPUUsageCollector) GetSnapshot(systemWide bool) *metrics.MetricsSnapshot {
+	result := metrics.NewMetricsSnapshot(systemWide)
+	for pid, src := range c.metrics {
+		onCPU, runq := src.OnCPUHist, src.RunqLatencyHist
+		entry := &metrics.ProcessMetrics{PID: pid, SampleCount: src.SampleCount}
+		entry.OnCPUStats = metrics.LatencyStats{
+			Min:   onCPU.Min(),
+			Max:   onCPU.Max(),
+			P50:   onCPU.ValueAtQuantile(50.0),
+			P95:   onCPU.ValueAtQuantile(95.0),
+			P99:   onCPU.ValueAtQuantile(99.0),
+			P999:  onCPU.ValueAtQuantile(99.9),
+			Mean:  onCPU.Mean(),
+			Count: onCPU.TotalCount(),
+		}
+		entry.RunqueueStats = metrics.LatencyStats{
+			Min:   runq.Min(),
+			Max:   runq.Max(),
+			P50:   runq.ValueAtQuantile(50.0),
+			P95:   runq.ValueAtQuantile(95.0),
+			P99:   runq.ValueAtQuantile(99.0),
+			P999:  runq.ValueAtQuantile(99.9),
+			Mean:  runq.Mean(),
+			Count: runq.TotalCount(),
+		}
+		entry.ContextSwitches = metrics.ContextSwitchStats{
+			PreemptedCount: src.PreemptedCount,
+			VoluntaryCount: src.VoluntaryCount,
+			IOWaitCount:    src.IOWaitCount,
+		}
+
+		// Counters are exported only when cycles or instructions were recorded.
+		cycles, instrs := src.TotalCycles, src.TotalInstructions
+		if cycles > 0 || instrs > 0 {
+			hw := metrics.HardwareCounterStats{
+				Available:    true,
+				Cycles:       cycles,
+				Instructions: instrs,
+				CacheMisses:  src.TotalCacheMisses,
+			}
+			if cycles > 0 { // avoid dividing by zero cycles
+				hw.IPC = float64(instrs) / float64(cycles)
+			}
+			if instrs > 0 { // cache misses per 1000 instructions
+				hw.MissRate = float64(src.TotalCacheMisses) / float64(instrs) * 1000
+			}
+			entry.HardwareCounters = hw
+		}
+
+		result.AddProcess(pid, entry)
+	}
+	return result
+}
+
+// ExportMetrics takes one snapshot and hands it to every exporter in order.
+// A failing exporter does not stop the rest; each failure is wrapped with
+// the exporter's Name and all are returned via errors.Join (nil if none).
+func (c *CPUUsageCollector) ExportMetrics(ctx context.Context, systemWide bool, exporters ...metrics.Exporter) error {
+	snapshot := c.GetSnapshot(systemWide)
+	var errs []error
+	for _, exp := range exporters {
+		if err := exp.Export(ctx, snapshot); err != nil {
+			errs = append(errs, fmt.Errorf("exporter %s: %w", exp.Name(), err))
+		}
+	}
+	return errors.Join(errs...)
+}