diff --git a/pyproject.toml b/pyproject.toml index 583a121e2..dd4a43427 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "GitAuto" -version = "1.24.3" +version = "1.24.5" requires-python = ">=3.14" dependencies = [ "annotated-doc==0.0.4", diff --git a/services/claude/test_evaluate_quality_checks.py b/services/claude/test_evaluate_quality_checks.py index e18736935..2ff0cc03c 100644 --- a/services/claude/test_evaluate_quality_checks.py +++ b/services/claude/test_evaluate_quality_checks.py @@ -1,8 +1,11 @@ # pyright: reportArgumentType=false +from pathlib import Path from unittest.mock import MagicMock, patch +import pytest + from constants.claude import MAX_OUTPUT_TOKENS -from constants.models import ClaudeModelId +from constants.models import ClaudeModelId, GoogleModelId from services.claude.evaluate_quality_checks import evaluate_quality_checks @@ -61,3 +64,122 @@ def test_haiku_45_passes_model_and_max_tokens(mock_claude): kwargs = _mock_claude_call(mock_claude, ClaudeModelId.HAIKU_4_5) assert kwargs["model"] == "claude-haiku-4-5" assert kwargs["max_tokens"] == MAX_OUTPUT_TOKENS[ClaudeModelId.HAIKU_4_5] + + +@pytest.mark.integration +def test_gemma_returns_case_coverage_for_real_file_pair(): + """Real Gemma call: verify case_coverage category is graded for a real source+test pair.""" + repo_root = Path(__file__).resolve().parents[2] + src_path = "services/claude/forget_messages.py" + test_path = "services/claude/test_forget_messages.py" + src_content = (repo_root / src_path).read_text() + test_content = (repo_root / test_path).read_text() + + result = evaluate_quality_checks( + source_content=src_content, + source_path=src_path, + test_files=[(test_path, test_content)], + model=GoogleModelId.GEMMA_4_31B, + ) + + assert result is not None + # Access directly — KeyError if missing + case_cov = result["case_coverage"] + assert set(case_cov.keys()) == { + "dimension_enumeration", + "combinatorial_matrix", + "explicit_expected_per_cell", + } + for check_name, check_data in case_cov.items(): + assert check_data["status"] in { + "pass", + "fail", + "na", + }, f"{check_name} has invalid status {check_data.get('status')!r}" + + +# Shared source: a small pure function with 3 independent input dimensions +# (sign of amount, currency, customer_tier) -> 2*3*3 = 18 business cases. +_DISCOUNT_SRC = ''' +def apply_discount(amount: float, currency: str, customer_tier: str) -> float: + """Return discounted amount. Premium gets 20%, gold 10%, standard 0%. + Non-USD gets extra 5% off. Negative amounts return 0.""" + if amount < 0: + return 0.0 + discount = 0.0 + if customer_tier == "premium": + discount = 0.20 + elif customer_tier == "gold": + discount = 0.10 + if currency != "USD": + discount += 0.05 + return amount * (1 - discount) +''' + +_WEAK_TEST = """ +from src.discount import apply_discount + +def test_premium_usd(): + assert apply_discount(100.0, "USD", "premium") == 80.0 +""" + +_STRONG_TEST = """ +import pytest +from src.discount import apply_discount + +# Matrix: sign x currency x tier -> expected (derived from business rules, not code) +@pytest.mark.parametrize("amount,currency,tier,expected", [ + # Negative amount early-returns 0 regardless of other dims (pruned) + (-1.0, "USD", "standard", 0.0), + # Positive amount, full matrix of currency x tier + (100.0, "USD", "standard", 100.0), + (100.0, "USD", "gold", 90.0), + (100.0, "USD", "premium", 80.0), + (100.0, "EUR", "standard", 95.0), + (100.0, "EUR", "gold", 85.0), + (100.0, "EUR", "premium", 75.0), + (100.0, "JPY", "standard", 95.0), + (100.0, "JPY", "gold", 85.0), + (100.0, "JPY", "premium", 75.0), +]) +def test_discount_matrix(amount, currency, tier, expected): + assert apply_discount(amount, currency, tier) == expected +""" + + +@pytest.mark.integration +def test_gemma_discriminates_weak_vs_strong_case_coverage(): + """Gemma should grade a 1-case test worse than a full matrix on case_coverage.""" + weak_result = evaluate_quality_checks( + source_content=_DISCOUNT_SRC, + source_path="src/discount.py", + test_files=[("tests/test_discount.py", _WEAK_TEST)], + model=GoogleModelId.GEMMA_4_31B, + ) + strong_result = evaluate_quality_checks( + source_content=_DISCOUNT_SRC, + source_path="src/discount.py", + test_files=[("tests/test_discount.py", _STRONG_TEST)], + model=GoogleModelId.GEMMA_4_31B, + ) + + assert weak_result is not None and strong_result is not None + weak_cov = weak_result["case_coverage"] + strong_cov = strong_result["case_coverage"] + + weak_fails = sum(1 for c in weak_cov.values() if c["status"] == "fail") + strong_fails = sum(1 for c in strong_cov.values() if c["status"] == "fail") + + print("\nWEAK case_coverage:") + for name, data in weak_cov.items(): + print(f" {name}: {data['status']} — {data.get('reason', '')}") + print("\nSTRONG case_coverage:") + for name, data in strong_cov.items(): + print(f" {name}: {data['status']} — {data.get('reason', '')}") + + # The weak test (1 case for 18-cell matrix) must fail more checks than the strong test. + assert ( + weak_fails > strong_fails + ), f"Gemma did not discriminate: weak_fails={weak_fails}, strong_fails={strong_fails}" + # The weak test must fail combinatorial_matrix specifically — it only has 1 case. + assert weak_cov["combinatorial_matrix"]["status"] == "fail" diff --git a/utils/quality_checks/checklist.py b/utils/quality_checks/checklist.py index 1f8e714e3..960722275 100644 --- a/utils/quality_checks/checklist.py +++ b/utils/quality_checks/checklist.py @@ -1,4 +1,9 @@ QUALITY_CHECKLIST: dict[str, list[str]] = { + "case_coverage": [ + "dimension_enumeration", + "combinatorial_matrix", + "explicit_expected_per_cell", + ], "integration": [ "db_operations_use_real_test_db", "api_calls_tested_end_to_end", diff --git a/utils/quality_checks/test_checklist.py b/utils/quality_checks/test_checklist.py index d3edafb29..bd63a99bf 100644 --- a/utils/quality_checks/test_checklist.py +++ b/utils/quality_checks/test_checklist.py @@ -2,34 +2,76 @@ from utils.quality_checks.get_checklist_hash import get_checklist_hash -def test_checklist_has_all_categories(): - expected_categories = { - "integration", - "adversarial", - "security", - "performance", - "memory", - "error_handling", - "accessibility", - "business_logic", - "seo", +def test_checklist_matches_expected(): + assert QUALITY_CHECKLIST == { + "case_coverage": [ + "dimension_enumeration", + "combinatorial_matrix", + "explicit_expected_per_cell", + ], + "integration": [ + "db_operations_use_real_test_db", + "api_calls_tested_end_to_end", + "env_var_guards_for_secrets", + ], + "business_logic": [ + "domain_rules", + "state_transitions", + "calculation_accuracy", + "data_integrity", + "workflow_correctness", + ], + "adversarial": [ + "null_undefined_inputs", + "empty_strings_arrays", + "boundary_values", + "type_coercion", + "large_inputs", + "race_conditions", + "unicode_special_chars", + ], + "security": [ + "xss", + "sql_injection", + "command_injection", + "code_injection", + "csrf", + "auth_bypass", + "sensitive_data_exposure", + "untrusted_input_sanitization", + "open_redirects", + "path_traversal", + ], + "performance": [ + "quadratic_algorithms", + "heavy_sync_operations", + "n_plus_1_queries", + "large_imports", + "redundant_computation", + ], + "memory": [ + "event_listener_cleanup", + "subscription_timer_cleanup", + "circular_references", + "closure_retention", + ], + "error_handling": [ + "graceful_degradation", + "user_error_messages", + ], + "accessibility": [ + "aria_attributes", + "keyboard_navigation", + "screen_reader", + "focus_management", + ], + "seo": [ + "meta_tags", + "semantic_html", + "heading_hierarchy", + "alt_text", + ], } - assert set(QUALITY_CHECKLIST.keys()) == expected_categories - - -def test_each_category_has_checks(): - for category, checks in QUALITY_CHECKLIST.items(): - assert len(checks) > 0, f"Category '{category}' has no checks" - for check in checks: - assert isinstance( - check, str - ), f"Check '{check}' in '{category}' is not a string" - assert len(check) > 0, f"Empty check name in '{category}'" - - -def test_no_duplicate_checks_within_categories(): - for category, checks in QUALITY_CHECKLIST.items(): - assert len(checks) == len(set(checks)), f"Duplicate checks in '{category}'" def test_get_checklist_hash_returns_hex_string(): diff --git a/uv.lock b/uv.lock index ba81bd557..743ff26c0 100644 --- a/uv.lock +++ b/uv.lock @@ -596,7 +596,7 @@ wheels = [ [[package]] name = "gitauto" -version = "1.24.3" +version = "1.24.5" source = { virtual = "." } dependencies = [ { name = "annotated-doc" },