Commit 8a8c04e

test(dspy): add comprehensive unit tests for GptmeReasoningProgram (#870)
* fix(dspy): register metadata for complexity test tasks

  Fixes #866

  The create_medium_complexity_task() and create_complex_task() functions were
  returning raw EvalSpec dicts without registering their metadata in the
  _task_metadata registry. This caused test_task_structure to fail because
  get_task_metadata() returned empty dicts for these tasks.

  Changes:
  - Convert both functions to use the TaskBuilder pattern
  - Call build_with_metadata() instead of returning raw dicts
  - Register metadata in _task_metadata before returning
  - Add appropriate focus_areas to metadata:
    - medium task: system_design, testing, complexity_medium
    - complex task: performance_optimization, profiling, complexity_high

  This ensures all tasks have proper metadata with focus_areas, allowing tests
  and the focus-area filtering system to work correctly. (A sketch of this
  pattern follows the commit header below.)

* fix(tests): add module-level skip for test_dspy_hybrid when dspy not available

  The test_dspy_hybrid.py file was causing collection errors in CI
  configurations that don't include the eval extras (empty extras string).
  Added the same skip pattern used in test_dspy_basic.py to gracefully skip the
  entire test module when dspy is not installed.

  Fixes the ERROR in CI: ModuleNotFoundError: No module named 'dspy'

* test(dspy): add comprehensive unit tests for GptmeReasoningProgram

  - Add tests for all five signature classes (TaskAnalysis, Planning,
    Execution, Monitoring, Recovery)
  - Add integration tests for the multi-stage flow
  - Add error handling and recovery logic tests
  - Add tests for both successful and failure paths
  - Add edge case tests (empty inputs, long inputs, retry limits)

  Addresses #789

* fix: correct import sorting in reasoning program tests

* fix: add required 'run' field to EvalSpec fixture

* fix: use dict instead of list for files in EvalSpec fixture

* fix(tests): add skip decorator to test_execute_with_recovery_max_retries

  This test requires DSPy LM configuration and should only run with the --eval
  flag, consistent with other integration tests in the file.
1 parent b888abb commit 8a8c04e
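The metadata-registration fix described in the first item of the commit message is not part of this commit's diff. A minimal sketch of the pattern that item describes is shown below; only the names TaskBuilder, build_with_metadata(), _task_metadata, get_task_metadata(), and the focus areas come from the commit message, while the class shape, method signatures, and field values are assumptions for illustration only:

# Hypothetical sketch of the described fix; not the actual gptme implementation.
_task_metadata: dict[str, dict] = {}


class TaskBuilder:
    """Stand-in builder; the real TaskBuilder lives in gptme's DSPy eval task module."""

    def __init__(self, name: str, prompt: str, focus_areas: list[str]):
        self.name = name
        self.prompt = prompt
        self.focus_areas = focus_areas

    def build_with_metadata(self) -> tuple[dict, dict]:
        # Build the EvalSpec-style dict together with its metadata
        spec = {
            "name": self.name,
            "prompt": self.prompt,
            "files": {},
            "run": "",
            "expect": {},
        }
        metadata = {"focus_areas": self.focus_areas}
        return spec, metadata


def create_medium_complexity_task() -> dict:
    builder = TaskBuilder(
        name="medium_complexity_task",
        prompt="Design and test a small module",
        focus_areas=["system_design", "testing", "complexity_medium"],
    )
    spec, metadata = builder.build_with_metadata()
    # Register the metadata so get_task_metadata() can return focus_areas for this task
    _task_metadata[spec["name"]] = metadata
    return spec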

Lines changed: 332 additions & 0 deletions
@@ -0,0 +1,332 @@
"""
Unit tests for the DSPy multi-stage reasoning program.

These tests verify the reasoning program's signature classes, multi-stage flow,
error handling, and recovery logic.
"""

import pytest

# Check if DSPy is available and handle import errors gracefully
try:
    from gptme.eval.dspy import _has_dspy  # fmt: skip

    if not _has_dspy():
        pytest.skip("DSPy not available", allow_module_level=True)
except (ImportError, ModuleNotFoundError):
    pytest.skip("DSPy module not available", allow_module_level=True)

# Try to import reasoning program components
try:
    import dspy

    from gptme.eval.dspy.reasoning_program import (
        ExecutionSignature,
        GptmeReasoningProgram,
        MonitoringSignature,
        PlanningSignature,
        RecoverySignature,
        TaskAnalysisSignature,
        create_reasoning_program,
    )
    from gptme.eval.types import EvalSpec
except (ImportError, AttributeError) as e:
    pytest.skip(f"Reasoning program imports failed: {e}", allow_module_level=True)
DEFAULT_MODEL = "anthropic/claude-3-5-haiku-20241022"

# Fixtures
@pytest.fixture
def eval_spec():
    """Create a basic EvalSpec for testing."""
    return EvalSpec(
        name="test_task",
        prompt="Test task prompt",
        files={},
        run="echo 'test'",
        expect={},
    )


@pytest.fixture
def reasoning_program():
    """Create a reasoning program instance for testing."""
    return create_reasoning_program()


# Test Signature Classes
def test_task_analysis_signature():
    """Test TaskAnalysisSignature structure."""
    # Verify signature exists and has correct fields
    assert hasattr(TaskAnalysisSignature, "__doc__")
    assert "task_description" in TaskAnalysisSignature.input_fields
    assert "context" in TaskAnalysisSignature.input_fields
    assert "system_capabilities" in TaskAnalysisSignature.input_fields
    assert "task_type" in TaskAnalysisSignature.output_fields
    assert "key_requirements" in TaskAnalysisSignature.output_fields
    assert "constraints" in TaskAnalysisSignature.output_fields
    assert "approach_strategy" in TaskAnalysisSignature.output_fields


def test_planning_signature():
    """Test PlanningSignature structure."""
    assert hasattr(PlanningSignature, "__doc__")
    assert "task_analysis" in PlanningSignature.input_fields
    assert "available_tools" in PlanningSignature.input_fields
    assert "execution_steps" in PlanningSignature.output_fields
    assert "dependencies" in PlanningSignature.output_fields
    assert "success_criteria" in PlanningSignature.output_fields


def test_execution_signature():
    """Test ExecutionSignature structure."""
    assert hasattr(ExecutionSignature, "__doc__")
    assert "step_description" in ExecutionSignature.input_fields
    assert "current_state" in ExecutionSignature.input_fields
    assert "available_tools" in ExecutionSignature.input_fields
    assert "tool_selection" in ExecutionSignature.output_fields
    assert "tool_invocation" in ExecutionSignature.output_fields
    assert "expected_outcome" in ExecutionSignature.output_fields


def test_monitoring_signature():
    """Test MonitoringSignature structure."""
    assert hasattr(MonitoringSignature, "__doc__")
    assert "step_description" in MonitoringSignature.input_fields
    assert "execution_result" in MonitoringSignature.input_fields
    assert "expected_outcome" in MonitoringSignature.input_fields
    assert "success_criteria" in MonitoringSignature.input_fields
    assert "status" in MonitoringSignature.output_fields
    assert "progress_assessment" in MonitoringSignature.output_fields
    assert "issues_detected" in MonitoringSignature.output_fields
    assert "next_action" in MonitoringSignature.output_fields


def test_recovery_signature():
    """Test RecoverySignature structure."""
    assert hasattr(RecoverySignature, "__doc__")
    assert "error_description" in RecoverySignature.input_fields
    assert "execution_context" in RecoverySignature.input_fields
    assert "previous_attempts" in RecoverySignature.input_fields
    assert "error_analysis" in RecoverySignature.output_fields
    assert "recovery_strategy" in RecoverySignature.output_fields
    assert "alternative_approach" in RecoverySignature.output_fields
    assert "preventive_measures" in RecoverySignature.output_fields
# Test GptmeReasoningProgram Class
def test_reasoning_program_initialization():
    """Test reasoning program initialization."""
    program = GptmeReasoningProgram()
    assert program.base_prompt == "You are a helpful AI assistant."
    assert hasattr(program, "analyze")
    assert hasattr(program, "plan")
    assert hasattr(program, "execute")
    assert hasattr(program, "monitor")
    assert hasattr(program, "recover")

    # Test custom base prompt
    custom_program = GptmeReasoningProgram(base_prompt="Custom prompt")
    assert custom_program.base_prompt == "Custom prompt"


def test_reasoning_program_modules():
    """Test that reasoning program has all required modules."""
    program = create_reasoning_program()
    assert isinstance(program.analyze, dspy.ChainOfThought)
    assert isinstance(program.plan, dspy.ChainOfThought)
    assert isinstance(program.execute, dspy.ChainOfThought)
    assert isinstance(program.monitor, dspy.ChainOfThought)
    assert isinstance(program.recover, dspy.ChainOfThought)


@pytest.mark.skip(
    reason="Requires LLM API access - use for integration testing with --eval flag"
)
def test_reasoning_program_forward_success(reasoning_program, eval_spec):
    """Test successful execution through all stages."""
    result = reasoning_program.forward(
        task_description="Write a simple hello world script",
        context="Empty directory",
        eval_spec=eval_spec,
        available_tools="shell, python, save",
    )

    # Verify prediction structure
    assert hasattr(result, "response")
    assert hasattr(result, "analysis")
    assert hasattr(result, "plan")
    assert hasattr(result, "execution")
    assert hasattr(result, "monitoring")
    assert result.eval_spec == eval_spec

    # Verify response contains expected sections
    response = result.response
    assert "# Task Analysis" in response
    assert "# Execution Plan" in response
    assert "# Execution" in response
    assert "# Monitoring" in response


def test_reasoning_program_forward_error_handling(reasoning_program, eval_spec):
    """Test error handling in forward method."""
    # Test with invalid inputs that should trigger error handling
    result = reasoning_program.forward(
        task_description=None,  # Invalid input
        context="",
        eval_spec=eval_spec,
    )

    # Should return error prediction
    assert hasattr(result, "response")
    assert "Error in reasoning program:" in result.response or hasattr(result, "error")


@pytest.mark.skip(
    reason="Requires LLM API access - use for integration testing with --eval flag"
)
def test_reasoning_program_execute_with_recovery_success(reasoning_program):
    """Test execute_with_recovery with successful execution."""
    execution, success = reasoning_program.execute_with_recovery(
        step_description="Print hello world",
        current_state="Initial state",
        available_tools="python",
        max_retries=3,
    )

    assert success is True
    assert execution is not None


@pytest.mark.skip(
    reason="Requires LLM API access - use for integration testing with --eval flag"
)
def test_reasoning_program_execute_with_recovery_failure(reasoning_program):
    """Test execute_with_recovery with failure and recovery attempts."""
    # Simulate a failing step
    execution, success = reasoning_program.execute_with_recovery(
        step_description="Do something impossible",
        current_state="Current state",
        available_tools="none",
        max_retries=2,
    )

    # Should attempt recovery but eventually fail
    # Note: Actual behavior depends on LLM responses
    assert execution is not None


def test_create_reasoning_program_factory():
    """Test the factory function creates a valid program."""
    program = create_reasoning_program()
    assert isinstance(program, GptmeReasoningProgram)
    assert hasattr(program, "forward")
    assert hasattr(program, "execute_with_recovery")
# Integration Tests
@pytest.mark.skip(
    reason="Requires LLM API access - use for integration testing with --eval flag"
)
def test_multi_stage_flow_integration(reasoning_program, eval_spec):
    """Test complete multi-stage flow from analysis to monitoring."""
    # Configure LLM
    dspy.configure(lm=dspy.LM(model=DEFAULT_MODEL))

    result = reasoning_program.forward(
        task_description="Create a Python script that calculates factorial",
        context="Empty directory with Python available",
        eval_spec=eval_spec,
        available_tools="python, save, shell",
    )

    # Verify all stages produced outputs
    assert hasattr(result, "analysis")
    assert hasattr(result, "plan")
    assert hasattr(result, "execution")
    assert hasattr(result, "monitoring")

    # Verify analysis contains expected fields
    analysis = result.analysis
    assert hasattr(analysis, "task_type") or "task_type" in str(analysis)
    assert hasattr(analysis, "approach_strategy") or "approach_strategy" in str(
        analysis
    )

    # Verify plan contains steps
    plan = result.plan
    assert hasattr(plan, "execution_steps") or "execution_steps" in str(plan)

    # Verify execution contains tool selection
    execution = result.execution
    assert hasattr(execution, "tool_selection") or "tool_selection" in str(execution)

    # Verify monitoring contains status
    monitoring = result.monitoring
    assert hasattr(monitoring, "status") or "status" in str(monitoring)


@pytest.mark.skip(
    reason="Requires LLM API access - use for integration testing with --eval flag"
)
def test_recovery_flow_integration(reasoning_program):
    """Test recovery flow when execution fails."""
    dspy.configure(lm=dspy.LM(model=DEFAULT_MODEL))

    # Execute with recovery on a deliberately difficult task
    execution, success = reasoning_program.execute_with_recovery(
        step_description="Access non-existent file",
        current_state="Working directory",
        available_tools="shell",
        max_retries=2,
    )

    # Should attempt recovery
    assert execution is not None
    # Success may vary based on LLM's ability to recover
# Edge Cases and Error Handling
def test_reasoning_program_empty_inputs(reasoning_program, eval_spec):
    """Test handling of empty inputs."""
    result = reasoning_program.forward(
        task_description="",
        context="",
        eval_spec=eval_spec,
        available_tools="",
    )

    # Should handle empty inputs gracefully
    assert result is not None
    assert hasattr(result, "response")


def test_reasoning_program_very_long_inputs(reasoning_program, eval_spec):
    """Test handling of very long inputs."""
    long_description = "Do something " * 1000  # Very long description
    result = reasoning_program.forward(
        task_description=long_description,
        context="context",
        eval_spec=eval_spec,
    )

    # Should handle long inputs without crashing
    assert result is not None


@pytest.mark.skip(
    reason="integration test: requires DSPy LM configuration, run with --eval"
)
def test_execute_with_recovery_max_retries(reasoning_program):
    """Test that execute_with_recovery respects max_retries."""
    # Test with max_retries=1 - should retry at most once
    execution, success = reasoning_program.execute_with_recovery(
        step_description="test step",
        current_state="state",
        available_tools="tools",
        max_retries=1,
    )

    # Should complete without error even with limited retries
    assert execution is not None
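For reference, the skipped integration paths above reduce to the following standalone sketch. It uses only calls already exercised in this file; the model id and EvalSpec field values are placeholders, and it needs LLM API credentials, so it is not something CI would run:

# Standalone sketch of the flow that the skipped integration tests exercise.
import dspy

from gptme.eval.dspy.reasoning_program import create_reasoning_program
from gptme.eval.types import EvalSpec

dspy.configure(lm=dspy.LM(model="anthropic/claude-3-5-haiku-20241022"))

program = create_reasoning_program()
spec = EvalSpec(
    name="factorial_demo",
    prompt="Create a Python script that calculates factorial",
    files={},
    run="python factorial.py",
    expect={},
)
result = program.forward(
    task_description="Create a Python script that calculates factorial",
    context="Empty directory with Python available",
    eval_spec=spec,
    available_tools="python, save, shell",
)
# Per the assertions above, the response should include the "# Task Analysis",
# "# Execution Plan", "# Execution", and "# Monitoring" sections.
print(result.response)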
