Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "GitAuto"
version = "1.24.3"
version = "1.24.5"
requires-python = ">=3.14"
dependencies = [
"annotated-doc==0.0.4",
Expand Down
124 changes: 123 additions & 1 deletion services/claude/test_evaluate_quality_checks.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
# pyright: reportArgumentType=false
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from constants.claude import MAX_OUTPUT_TOKENS
from constants.models import ClaudeModelId
from constants.models import ClaudeModelId, GoogleModelId
from services.claude.evaluate_quality_checks import evaluate_quality_checks


Expand Down Expand Up @@ -61,3 +64,122 @@ def test_haiku_45_passes_model_and_max_tokens(mock_claude):
kwargs = _mock_claude_call(mock_claude, ClaudeModelId.HAIKU_4_5)
assert kwargs["model"] == "claude-haiku-4-5"
assert kwargs["max_tokens"] == MAX_OUTPUT_TOKENS[ClaudeModelId.HAIKU_4_5]


@pytest.mark.integration
def test_gemma_returns_case_coverage_for_real_file_pair():
    """Real Gemma call: verify case_coverage category is graded for a real source+test pair."""
    root = Path(__file__).resolve().parents[2]
    source_rel = "services/claude/forget_messages.py"
    tests_rel = "services/claude/test_forget_messages.py"

    result = evaluate_quality_checks(
        source_content=(root / source_rel).read_text(),
        source_path=source_rel,
        test_files=[(tests_rel, (root / tests_rel).read_text())],
        model=GoogleModelId.GEMMA_4_31B,
    )

    assert result is not None
    # Index directly so a missing category surfaces as a KeyError.
    case_cov = result["case_coverage"]
    expected_checks = {
        "dimension_enumeration",
        "combinatorial_matrix",
        "explicit_expected_per_cell",
    }
    assert set(case_cov) == expected_checks
    valid_statuses = {"pass", "fail", "na"}
    for name, data in case_cov.items():
        assert (
            data["status"] in valid_statuses
        ), f"{name} has invalid status {data.get('status')!r}"


# Shared source: a small pure function with 3 independent input dimensions
# (sign of amount, currency, customer_tier) -> 2*3*3 = 18 business cases.
_DISCOUNT_SRC = '''
def apply_discount(amount: float, currency: str, customer_tier: str) -> float:
"""Return discounted amount. Premium gets 20%, gold 10%, standard 0%.
Non-USD gets extra 5% off. Negative amounts return 0."""
if amount < 0:
return 0.0
discount = 0.0
if customer_tier == "premium":
discount = 0.20
elif customer_tier == "gold":
discount = 0.10
if currency != "USD":
discount += 0.05
return amount * (1 - discount)
'''

_WEAK_TEST = """
from src.discount import apply_discount

def test_premium_usd():
assert apply_discount(100.0, "USD", "premium") == 80.0
"""

_STRONG_TEST = """
import pytest
from src.discount import apply_discount

# Matrix: sign x currency x tier -> expected (derived from business rules, not code)
@pytest.mark.parametrize("amount,currency,tier,expected", [
# Negative amount early-returns 0 regardless of other dims (pruned)
(-1.0, "USD", "standard", 0.0),
# Positive amount, full matrix of currency x tier
(100.0, "USD", "standard", 100.0),
(100.0, "USD", "gold", 90.0),
(100.0, "USD", "premium", 80.0),
(100.0, "EUR", "standard", 95.0),
(100.0, "EUR", "gold", 85.0),
(100.0, "EUR", "premium", 75.0),
(100.0, "JPY", "standard", 95.0),
(100.0, "JPY", "gold", 85.0),
(100.0, "JPY", "premium", 75.0),
])
def test_discount_matrix(amount, currency, tier, expected):
assert apply_discount(amount, currency, tier) == expected
"""


@pytest.mark.integration
def test_gemma_discriminates_weak_vs_strong_case_coverage():
    """Gemma should grade a 1-case test worse than a full matrix on case_coverage."""

    def grade(test_content):
        # One real Gemma evaluation of the shared discount source against a test file.
        return evaluate_quality_checks(
            source_content=_DISCOUNT_SRC,
            source_path="src/discount.py",
            test_files=[("tests/test_discount.py", test_content)],
            model=GoogleModelId.GEMMA_4_31B,
        )

    weak_result = grade(_WEAK_TEST)
    strong_result = grade(_STRONG_TEST)

    assert weak_result is not None and strong_result is not None
    weak_cov = weak_result["case_coverage"]
    strong_cov = strong_result["case_coverage"]

    def count_fails(cov):
        return sum(1 for entry in cov.values() if entry["status"] == "fail")

    weak_fails = count_fails(weak_cov)
    strong_fails = count_fails(strong_cov)

    print("\nWEAK case_coverage:")
    for name, data in weak_cov.items():
        print(f"  {name}: {data['status']} — {data.get('reason', '')}")
    print("\nSTRONG case_coverage:")
    for name, data in strong_cov.items():
        print(f"  {name}: {data['status']} — {data.get('reason', '')}")

    # The weak test (1 case for 18-cell matrix) must fail more checks than the strong test.
    assert (
        weak_fails > strong_fails
    ), f"Gemma did not discriminate: weak_fails={weak_fails}, strong_fails={strong_fails}"
    # The weak test must fail combinatorial_matrix specifically — it only has 1 case.
    assert weak_cov["combinatorial_matrix"]["status"] == "fail"
5 changes: 5 additions & 0 deletions utils/quality_checks/checklist.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
QUALITY_CHECKLIST: dict[str, list[str]] = {
"case_coverage": [
"dimension_enumeration",
"combinatorial_matrix",
"explicit_expected_per_cell",
],
"integration": [
"db_operations_use_real_test_db",
"api_calls_tested_end_to_end",
Expand Down
96 changes: 69 additions & 27 deletions utils/quality_checks/test_checklist.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,34 +2,76 @@
from utils.quality_checks.get_checklist_hash import get_checklist_hash


def test_checklist_matches_expected():
    """Pin the exact contents of QUALITY_CHECKLIST.

    A full-equality assertion (rather than a key-set comparison) makes any
    added, removed, renamed, or reordered category/check show up as a test
    failure with a readable dict diff, so checklist changes are always
    deliberate and reviewed.
    """
    # NOTE(review): the previous key-set-only test (test_checklist_has_all_categories)
    # is superseded by this exact-match assertion; its leftover lines were removed.
    assert QUALITY_CHECKLIST == {
        "case_coverage": [
            "dimension_enumeration",
            "combinatorial_matrix",
            "explicit_expected_per_cell",
        ],
        "integration": [
            "db_operations_use_real_test_db",
            "api_calls_tested_end_to_end",
            "env_var_guards_for_secrets",
        ],
        "business_logic": [
            "domain_rules",
            "state_transitions",
            "calculation_accuracy",
            "data_integrity",
            "workflow_correctness",
        ],
        "adversarial": [
            "null_undefined_inputs",
            "empty_strings_arrays",
            "boundary_values",
            "type_coercion",
            "large_inputs",
            "race_conditions",
            "unicode_special_chars",
        ],
        "security": [
            "xss",
            "sql_injection",
            "command_injection",
            "code_injection",
            "csrf",
            "auth_bypass",
            "sensitive_data_exposure",
            "untrusted_input_sanitization",
            "open_redirects",
            "path_traversal",
        ],
        "performance": [
            "quadratic_algorithms",
            "heavy_sync_operations",
            "n_plus_1_queries",
            "large_imports",
            "redundant_computation",
        ],
        "memory": [
            "event_listener_cleanup",
            "subscription_timer_cleanup",
            "circular_references",
            "closure_retention",
        ],
        "error_handling": [
            "graceful_degradation",
            "user_error_messages",
        ],
        "accessibility": [
            "aria_attributes",
            "keyboard_navigation",
            "screen_reader",
            "focus_management",
        ],
        "seo": [
            "meta_tags",
            "semantic_html",
            "heading_hierarchy",
            "alt_text",
        ],
    }


def test_each_category_has_checks():
    """Every category must list at least one check, each a non-empty string."""
    for category, check_names in QUALITY_CHECKLIST.items():
        assert len(check_names) > 0, f"Category '{category}' has no checks"
        for check in check_names:
            name_is_string = isinstance(check, str)
            assert name_is_string, f"Check '{check}' in '{category}' is not a string"
            assert len(check) > 0, f"Empty check name in '{category}'"


def test_no_duplicate_checks_within_categories():
    """A check name must not appear more than once inside any single category."""
    for category, checks in QUALITY_CHECKLIST.items():
        deduplicated = set(checks)
        assert len(checks) == len(deduplicated), f"Duplicate checks in '{category}'"


def test_get_checklist_hash_returns_hex_string():
Expand Down
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.