Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "GitAuto"
version = "1.24.3"
version = "1.24.5"
requires-python = ">=3.14"
dependencies = [
"annotated-doc==0.0.4",
Expand Down
124 changes: 123 additions & 1 deletion services/claude/test_evaluate_quality_checks.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
# pyright: reportArgumentType=false
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from constants.claude import MAX_OUTPUT_TOKENS
from constants.models import ClaudeModelId
from constants.models import ClaudeModelId, GoogleModelId
from services.claude.evaluate_quality_checks import evaluate_quality_checks


Expand Down Expand Up @@ -61,3 +64,122 @@ def test_haiku_45_passes_model_and_max_tokens(mock_claude):
kwargs = _mock_claude_call(mock_claude, ClaudeModelId.HAIKU_4_5)
assert kwargs["model"] == "claude-haiku-4-5"
assert kwargs["max_tokens"] == MAX_OUTPUT_TOKENS[ClaudeModelId.HAIKU_4_5]


@pytest.mark.integration
def test_gemma_returns_case_coverage_for_real_file_pair():
    """Real Gemma call: verify case_coverage category is graded for a real source+test pair."""
    root = Path(__file__).resolve().parents[2]
    source_rel = "services/claude/forget_messages.py"
    tests_rel = "services/claude/test_forget_messages.py"

    result = evaluate_quality_checks(
        source_content=(root / source_rel).read_text(),
        source_path=source_rel,
        test_files=[(tests_rel, (root / tests_rel).read_text())],
        model=GoogleModelId.GEMMA_4_31B,
    )

    assert result is not None
    # Index directly so a missing category surfaces as a KeyError.
    case_cov = result["case_coverage"]
    expected_checks = {
        "dimension_enumeration",
        "combinatorial_matrix",
        "explicit_expected_per_cell",
    }
    assert set(case_cov) == expected_checks
    valid_statuses = {"pass", "fail", "na"}
    for name, data in case_cov.items():
        assert (
            data["status"] in valid_statuses
        ), f"{name} has invalid status {data.get('status')!r}"


# Shared source: a small pure function with 3 independent input dimensions
# (sign of amount, currency, customer_tier) -> 2*3*3 = 18 business cases.
_DISCOUNT_SRC = '''
def apply_discount(amount: float, currency: str, customer_tier: str) -> float:
"""Return discounted amount. Premium gets 20%, gold 10%, standard 0%.
Non-USD gets extra 5% off. Negative amounts return 0."""
if amount < 0:
return 0.0
discount = 0.0
if customer_tier == "premium":
discount = 0.20
elif customer_tier == "gold":
discount = 0.10
if currency != "USD":
discount += 0.05
return amount * (1 - discount)
'''

_WEAK_TEST = """
from src.discount import apply_discount

def test_premium_usd():
assert apply_discount(100.0, "USD", "premium") == 80.0
"""

_STRONG_TEST = """
import pytest
from src.discount import apply_discount

# Matrix: sign x currency x tier -> expected (derived from business rules, not code)
@pytest.mark.parametrize("amount,currency,tier,expected", [
# Negative amount early-returns 0 regardless of other dims (pruned)
(-1.0, "USD", "standard", 0.0),
# Positive amount, full matrix of currency x tier
(100.0, "USD", "standard", 100.0),
(100.0, "USD", "gold", 90.0),
(100.0, "USD", "premium", 80.0),
(100.0, "EUR", "standard", 95.0),
(100.0, "EUR", "gold", 85.0),
(100.0, "EUR", "premium", 75.0),
(100.0, "JPY", "standard", 95.0),
(100.0, "JPY", "gold", 85.0),
(100.0, "JPY", "premium", 75.0),
])
def test_discount_matrix(amount, currency, tier, expected):
assert apply_discount(amount, currency, tier) == expected
"""


@pytest.mark.integration
def test_gemma_discriminates_weak_vs_strong_case_coverage():
    """Gemma should grade a 1-case test worse than a full matrix on case_coverage."""

    def grade(test_content):
        # One real Gemma evaluation of the shared discount source against a test file.
        return evaluate_quality_checks(
            source_content=_DISCOUNT_SRC,
            source_path="src/discount.py",
            test_files=[("tests/test_discount.py", test_content)],
            model=GoogleModelId.GEMMA_4_31B,
        )

    weak_result = grade(_WEAK_TEST)
    strong_result = grade(_STRONG_TEST)

    assert weak_result is not None and strong_result is not None
    weak_cov = weak_result["case_coverage"]
    strong_cov = strong_result["case_coverage"]

    def count_fails(cov):
        return sum(1 for entry in cov.values() if entry["status"] == "fail")

    weak_fails = count_fails(weak_cov)
    strong_fails = count_fails(strong_cov)

    print("\nWEAK case_coverage:")
    for name, data in weak_cov.items():
        print(f"  {name}: {data['status']} — {data.get('reason', '')}")
    print("\nSTRONG case_coverage:")
    for name, data in strong_cov.items():
        print(f"  {name}: {data['status']} — {data.get('reason', '')}")

    # The weak test (1 case for 18-cell matrix) must fail more checks than the strong test.
    assert (
        weak_fails > strong_fails
    ), f"Gemma did not discriminate: weak_fails={weak_fails}, strong_fails={strong_fails}"
    # The weak test must fail combinatorial_matrix specifically — it only has 1 case.
    assert weak_cov["combinatorial_matrix"]["status"] == "fail"
5 changes: 5 additions & 0 deletions utils/quality_checks/checklist.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
QUALITY_CHECKLIST: dict[str, list[str]] = {
"case_coverage": [
"dimension_enumeration",
"combinatorial_matrix",
"explicit_expected_per_cell",
],
"integration": [
"db_operations_use_real_test_db",
"api_calls_tested_end_to_end",
Expand Down
96 changes: 69 additions & 27 deletions utils/quality_checks/test_checklist.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,34 +2,76 @@
from utils.quality_checks.get_checklist_hash import get_checklist_hash


def test_checklist_matches_expected():
    """Pin the exact contents of QUALITY_CHECKLIST.

    A full-equality assertion (rather than a key-set comparison) makes any
    added, removed, renamed, or reordered category/check show up as a test
    failure with a readable dict diff, so checklist changes are always
    deliberate and reviewed.
    """
    # NOTE(review): the previous key-set-only test (test_checklist_has_all_categories)
    # is superseded by this exact-match assertion; its leftover lines were removed.
    assert QUALITY_CHECKLIST == {
        "case_coverage": [
            "dimension_enumeration",
            "combinatorial_matrix",
            "explicit_expected_per_cell",
        ],
        "integration": [
            "db_operations_use_real_test_db",
            "api_calls_tested_end_to_end",
            "env_var_guards_for_secrets",
        ],
        "business_logic": [
            "domain_rules",
            "state_transitions",
            "calculation_accuracy",
            "data_integrity",
            "workflow_correctness",
        ],
        "adversarial": [
            "null_undefined_inputs",
            "empty_strings_arrays",
            "boundary_values",
            "type_coercion",
            "large_inputs",
            "race_conditions",
            "unicode_special_chars",
        ],
        "security": [
            "xss",
            "sql_injection",
            "command_injection",
            "code_injection",
            "csrf",
            "auth_bypass",
            "sensitive_data_exposure",
            "untrusted_input_sanitization",
            "open_redirects",
            "path_traversal",
        ],
        "performance": [
            "quadratic_algorithms",
            "heavy_sync_operations",
            "n_plus_1_queries",
            "large_imports",
            "redundant_computation",
        ],
        "memory": [
            "event_listener_cleanup",
            "subscription_timer_cleanup",
            "circular_references",
            "closure_retention",
        ],
        "error_handling": [
            "graceful_degradation",
            "user_error_messages",
        ],
        "accessibility": [
            "aria_attributes",
            "keyboard_navigation",
            "screen_reader",
            "focus_management",
        ],
        "seo": [
            "meta_tags",
            "semantic_html",
            "heading_hierarchy",
            "alt_text",
        ],
    }


def test_each_category_has_checks():
    """Every category must list at least one check, each a non-empty string."""
    for category, check_names in QUALITY_CHECKLIST.items():
        assert len(check_names) > 0, f"Category '{category}' has no checks"
        for check in check_names:
            name_is_string = isinstance(check, str)
            assert name_is_string, f"Check '{check}' in '{category}' is not a string"
            assert len(check) > 0, f"Empty check name in '{category}'"


def test_no_duplicate_checks_within_categories():
    """A check name must not appear more than once inside any single category."""
    for category, checks in QUALITY_CHECKLIST.items():
        deduplicated = set(checks)
        assert len(checks) == len(deduplicated), f"Duplicate checks in '{category}'"


def test_get_checklist_hash_returns_hex_string():
Expand Down
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.