30 changes: 30 additions & 0 deletions .github/workflows/ci.yml
@@ -84,6 +84,36 @@ jobs:
        run: cd pkg/workflow/js && npm ci
      - name: Run tests
        run: cd pkg/workflow/js && npm test
  bench:
    runs-on: ubuntu-latest
    permissions:
      contents: read
    concurrency:
      group: ${{ github.workflow }}-${{ github.ref }}-bench
      cancel-in-progress: true
    steps:
      - name: Checkout code
        uses: actions/checkout@v5

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
          cache: true

      - name: Verify dependencies
        run: go mod verify

      - name: Run benchmarks
        run: make bench

      - name: Save benchmark results
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: bench_results.txt
          if-no-files-found: ignore

  lint:
    runs-on: ubuntu-latest
    permissions:
3 changes: 3 additions & 0 deletions .gitignore
@@ -51,6 +51,9 @@ coverage.html
coverage/
logs/

# Benchmark results
bench_results.txt

node_modules/
gh-aw-test/

13 changes: 13 additions & 0 deletions Makefile
@@ -58,6 +58,19 @@ test-perf:
	rm -f /tmp/gh-aw/test-output.log; \
	exit $$EXIT_CODE

# Run benchmarks for performance testing
.PHONY: bench
bench:
	@echo "Running benchmarks..."
	go test -bench=. -benchmem -benchtime=3x -run=^$$ ./pkg/... | tee bench_results.txt

# Run benchmarks with comparison output
.PHONY: bench-compare
bench-compare:
	@echo "Running benchmarks and saving results..."
	go test -bench=. -benchmem -benchtime=100x -run=^$$ ./pkg/... | tee bench_results.txt
	@echo "Benchmark results saved to bench_results.txt"

# Test JavaScript files
.PHONY: test-js
test-js: build-js
57 changes: 56 additions & 1 deletion TESTING.md
@@ -10,6 +10,60 @@ The testing framework implements **Phase 6 (Quality Assurance)** of the Go reimplementation

### 1. Unit Tests (`pkg/*/`)

### 2. Benchmarks (`pkg/*/*_benchmark_test.go`)

Performance benchmarks measure the speed of critical operations. Run benchmarks to:
- Detect performance regressions
- Identify optimization opportunities
- Track performance trends over time

**Running Benchmarks:**
```bash
# Run all benchmarks with make (optimized for CI, runs in ~6 seconds)
make bench

# Run all benchmarks manually
go test -bench=. -benchtime=3x -run=^$ ./pkg/...

# Run benchmarks with more iterations for comparison
make bench-compare

# Run benchmarks for specific package
go test -bench=. -benchtime=3x -run=^$ ./pkg/workflow/

# Run specific benchmark
go test -bench=BenchmarkCompileWorkflow -benchtime=3x -run=^$ ./pkg/workflow/

# Run with custom iterations (default is 1 second per benchmark)
go test -bench=. -benchtime=100x -run=^$ ./pkg/workflow/

# Run with memory profiling
go test -bench=. -benchmem -benchtime=3x -run=^$ ./pkg/...

# Compare benchmark results over time
go test -bench=. -benchtime=3x -run=^$ ./pkg/... > bench_baseline.txt
# ... make changes ...
go test -bench=. -benchtime=3x -run=^$ ./pkg/... > bench_new.txt
benchstat bench_baseline.txt bench_new.txt
```
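
For comparisons, `benchstat` reports per-benchmark deltas with a significance test. Its output resembles the following (numbers are illustrative, not measurements from this repository):

```
name                 old time/op    new time/op    delta
CompileWorkflow-8     412µs ± 2%     395µs ± 3%   -4.13%  (p=0.008 n=10)
ParseFrontmatter-8   18.2µs ± 1%    18.3µs ± 2%     ~      (p=0.547 n=10)
```

A `~` in the delta column means `benchstat` found no statistically significant difference between the two result files.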

**Note**: Benchmarks use `-benchtime=3x` (3 iterations) for fast CI execution. For more accurate measurements, use `-benchtime=100x` or longer durations.

**Benchmark Coverage:**
- **Workflow Compilation**: Basic, with MCP, with imports, with validation, complex workflows
- **Frontmatter Parsing**: Simple, complex, minimal, with arrays, schema validation
- **Expression Validation**: Single expressions, complex expressions, full markdown validation, parsing
- **Log Processing**: Claude, Copilot, Codex log parsing, aggregation, JSON metrics extraction
- **MCP Configuration**: Playwright config, Docker args, expression extraction
- **Tool Processing**: Simple and complex tool configurations, safe outputs, network permissions

**Performance Baselines** (approximate, machine-dependent):
- Workflow compilation: ~100μs - 2ms depending on complexity
- Frontmatter parsing: ~10μs - 250μs depending on complexity
- Expression validation: ~700ns - 10μs per expression
- Log parsing: ~50μs - 1ms depending on log size
- Schema validation: ~35μs - 130μs depending on complexity
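
These baselines correspond to the `ns/op` column of raw `go test -bench` output. A single result line breaks down like this (illustrative numbers, not a recorded measurement):

```
BenchmarkCompileWorkflow-8    3    412345 ns/op    183224 B/op    2101 allocs/op
```

Here `-8` is the `GOMAXPROCS` value, `3` is the iteration count (from `-benchtime=3x`), and the `B/op` and `allocs/op` columns are added by `-benchmem`.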

### 3. Test Validation Framework (`test_validation.go`)

Comprehensive validation system that ensures:
@@ -73,6 +127,7 @@ As the Go implementation develops:
- CLI interface structure and stability
- Basic workflow compilation interface
- Error handling for malformed inputs
- **Performance benchmarks** for critical operations (62+ benchmarks)

### 🔄 Interface Testing (Ready for Implementation)
- CLI command execution (stubs tested)
@@ -81,7 +136,7 @@ As the Go implementation develops:

### 📋 Ready for Enhancement
- Bash-Go output comparison (when compiler is complete)
- Performance benchmarking
- **Performance regression tracking** (baseline established)
- Cross-platform compatibility testing
- Real workflow execution testing

231 changes: 231 additions & 0 deletions pkg/cli/logs_benchmark_test.go
@@ -0,0 +1,231 @@
package cli

import (
	"testing"

	"github.com/githubnext/gh-aw/pkg/workflow"
)

// Sample log content for benchmarking
const (
	sampleClaudeLog = `[{"type":"session_created","timestamp":"2024-01-15T10:00:00.000Z"}]
[{"type":"message","timestamp":"2024-01-15T10:00:01.000Z","message":"Starting analysis"}]
[{"type":"tool_use","timestamp":"2024-01-15T10:00:02.000Z","tool":"github.get_issue"}]
[{"type":"tool_result","timestamp":"2024-01-15T10:00:03.000Z"}]
[{"type":"usage","timestamp":"2024-01-15T10:00:04.000Z","input_tokens":1000,"output_tokens":500}]
[{"type":"message","timestamp":"2024-01-15T10:00:05.000Z","message":"Analysis complete"}]
[{"type":"result","timestamp":"2024-01-15T10:00:06.000Z","total_input_tokens":1000,"total_output_tokens":500,"cost":0.015}]`

	sampleCopilotLog = `2024-01-15T10:00:00.123Z [INFO] Copilot started
2024-01-15T10:00:01.456Z [INFO] Processing request
2024-01-15T10:00:02.789Z [DEBUG] Tool call: github.get_issue
2024-01-15T10:00:03.012Z [DEBUG] Tool result received
2024-01-15T10:00:04.345Z [INFO] Token usage: 1500 total
2024-01-15T10:00:05.678Z [ERROR] Minor issue detected
2024-01-15T10:00:06.901Z [INFO] Request completed`

	sampleCodexLog = `] tool github.search_issues(...)
tool result: [{"id": 123, "title": "Issue 1"}]
] exec ls -la in /tmp
exec result: total 8
] tool github.get_issue(...)
tool result: {"id": 123, "body": "Issue content"}
] success in 2.5s`

	largeClaudeLog = sampleClaudeLog + "\n" + sampleClaudeLog + "\n" + sampleClaudeLog + "\n" + sampleClaudeLog + "\n" + sampleClaudeLog

	largeCopilotLog = sampleCopilotLog + "\n" + sampleCopilotLog + "\n" + sampleCopilotLog + "\n" + sampleCopilotLog + "\n" + sampleCopilotLog
)

// BenchmarkParseClaudeLog benchmarks Claude log parsing
func BenchmarkParseClaudeLog(b *testing.B) {
	engine := &workflow.ClaudeEngine{}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = engine.ParseLogMetrics(sampleClaudeLog, false)
	}
}

// BenchmarkParseClaudeLog_Large benchmarks parsing large Claude log file
func BenchmarkParseClaudeLog_Large(b *testing.B) {
	engine := &workflow.ClaudeEngine{}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = engine.ParseLogMetrics(largeClaudeLog, false)
	}
}

// BenchmarkParseCopilotLog benchmarks Copilot log parsing
func BenchmarkParseCopilotLog(b *testing.B) {
	engine := &workflow.CopilotEngine{}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = engine.ParseLogMetrics(sampleCopilotLog, false)
	}
}

// BenchmarkParseCopilotLog_Large benchmarks parsing large Copilot log file
func BenchmarkParseCopilotLog_Large(b *testing.B) {
	engine := &workflow.CopilotEngine{}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = engine.ParseLogMetrics(largeCopilotLog, false)
	}
}

// BenchmarkParseCodexLog benchmarks Codex log parsing
func BenchmarkParseCodexLog(b *testing.B) {
	engine := &workflow.CodexEngine{}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = engine.ParseLogMetrics(sampleCodexLog, false)
	}
}

// BenchmarkParseCodexLog_WithErrors benchmarks Codex log parsing with errors
func BenchmarkParseCodexLog_WithErrors(b *testing.B) {
	logWithErrors := sampleCodexLog + `
] error: connection timeout
] warning: retry attempt
] error: max retries exceeded
] tool github.get_repository(...)
] success in 1.2s`

	engine := &workflow.CodexEngine{}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = engine.ParseLogMetrics(logWithErrors, false)
	}
}

// BenchmarkAggregateWorkflowStats benchmarks log aggregation across multiple runs
func BenchmarkAggregateWorkflowStats(b *testing.B) {
	// Create sample workflow runs
	runs := []WorkflowRun{
		{
			DatabaseID:    12345,
			WorkflowName:  "test-workflow-1",
			Status:        "completed",
			Conclusion:    "success",
			TokenUsage:    1500,
			EstimatedCost: 0.015,
			Turns:         3,
			ErrorCount:    0,
			WarningCount:  1,
		},
		{
			DatabaseID:    12346,
			WorkflowName:  "test-workflow-2",
			Status:        "completed",
			Conclusion:    "failure",
			TokenUsage:    2500,
			EstimatedCost: 0.025,
			Turns:         5,
			ErrorCount:    2,
			WarningCount:  3,
		},
		{
			DatabaseID:    12347,
			WorkflowName:  "test-workflow-1",
			Status:        "completed",
			Conclusion:    "success",
			TokenUsage:    1800,
			EstimatedCost: 0.018,
			Turns:         4,
			ErrorCount:    0,
			WarningCount:  0,
		},
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Simulate aggregation logic
		totalTokens := 0
		totalCost := 0.0
		totalTurns := 0
		totalErrors := 0
		totalWarnings := 0

		for _, run := range runs {
			totalTokens += run.TokenUsage
			totalCost += run.EstimatedCost
			totalTurns += run.Turns
			totalErrors += run.ErrorCount
			totalWarnings += run.WarningCount
		}

		_ = totalTokens
		_ = totalCost
		_ = totalTurns
		_ = totalErrors
		_ = totalWarnings
	}
}

// BenchmarkAggregateWorkflowStats_Large benchmarks aggregation with many runs
func BenchmarkAggregateWorkflowStats_Large(b *testing.B) {
	// Create 100 sample workflow runs
	runs := make([]WorkflowRun, 100)
	for i := 0; i < 100; i++ {
		runs[i] = WorkflowRun{
			DatabaseID:    int64(12345 + i),
			WorkflowName:  "test-workflow",
			Status:        "completed",
			Conclusion:    "success",
			TokenUsage:    1500 + i*10,
			EstimatedCost: 0.015 + float64(i)*0.001,
			Turns:         3 + i%5,
			ErrorCount:    i % 3,
			WarningCount:  i % 2,
		}
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		totalTokens := 0
		totalCost := 0.0
		totalTurns := 0
		totalErrors := 0
		totalWarnings := 0

		for _, run := range runs {
			totalTokens += run.TokenUsage
			totalCost += run.EstimatedCost
			totalTurns += run.Turns
			totalErrors += run.ErrorCount
			totalWarnings += run.WarningCount
		}

		_ = totalTokens
		_ = totalCost
		_ = totalTurns
		_ = totalErrors
		_ = totalWarnings
Comment on lines +191 to +209
Copilot AI (Nov 13, 2025):

The BenchmarkAggregateWorkflowStats and BenchmarkAggregateWorkflowStats_Large benchmarks simulate aggregation logic inline instead of calling actual aggregation functions from the codebase. This creates a maintainability issue:

1. If the real aggregation logic changes, these benchmarks won't reflect the actual performance
2. The benchmarks test a simplified version rather than the real implementation
3. Results may not accurately represent production performance

Recommendation: Replace the inline aggregation code with calls to actual aggregation functions from the codebase. For example, if there's a function like AggregateWorkflowStats() or similar in the CLI package, the benchmark should call that function instead of manually summing values.

Suggested change:
-		totalTokens := 0
-		totalCost := 0.0
-		totalTurns := 0
-		totalErrors := 0
-		totalWarnings := 0
-		for _, run := range runs {
-			totalTokens += run.TokenUsage
-			totalCost += run.EstimatedCost
-			totalTurns += run.Turns
-			totalErrors += run.ErrorCount
-			totalWarnings += run.WarningCount
-		}
-		_ = totalTokens
-		_ = totalCost
-		_ = totalTurns
-		_ = totalErrors
-		_ = totalWarnings
+		agg := AggregateWorkflowStats(runs)
+		_ = agg

	}
}

// BenchmarkExtractJSONMetrics benchmarks JSON metrics extraction
func BenchmarkExtractJSONMetrics(b *testing.B) {
	jsonLine := `{"type":"usage","input_tokens":1000,"output_tokens":500,"cost":0.015}`

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = workflow.ExtractJSONMetrics(jsonLine, false)
	}
}

// BenchmarkExtractJSONMetrics_Complex benchmarks complex JSON metrics extraction
func BenchmarkExtractJSONMetrics_Complex(b *testing.B) {
	jsonLine := `{"type":"result","total_input_tokens":5000,"total_output_tokens":2500,"cost":0.075,"metadata":{"tool_calls":["github.get_issue","github.add_comment"],"duration_ms":1500}}`

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = workflow.ExtractJSONMetrics(jsonLine, false)
	}
}