Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions PythonScripts/audit_translations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,7 @@
Read README.md for more details.
"""

import sys

sys.stdout.reconfigure(encoding="utf-8")
from .cli import main # noqa: E402
from .cli import main

__all__ = [
"main",
Expand Down
109 changes: 36 additions & 73 deletions PythonScripts/audit_translations/auditor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,30 +5,12 @@
and for performing full language audits.
"""

import sys
from pathlib import Path

from rich.panel import Panel
from rich.table import Table

from .dataclasses import ComparisonResult, RuleInfo
from .parsers import diff_rules, parse_yaml_file
from .renderer import console, print_warnings

# Re-export console so existing `from .auditor import console` callers keep working.
__all__ = ["console"]

GREEN_FILE_COUNT_THRESHOLD = 7
YELLOW_FILE_COUNT_THRESHOLD = 4


def file_count_color(file_count: int) -> str:
"""Map number of translated YAML files to a display color."""
if file_count >= GREEN_FILE_COUNT_THRESHOLD:
return "green"
if file_count >= YELLOW_FILE_COUNT_THRESHOLD:
return "yellow"
return "red"
from .differ import diff_rules
from .models import AuditError, AuditSummary, ComparisonResult, RuleInfo
from .parsers import parse_yaml_file
from .renderer import console, print_audit_header, print_audit_summary, print_language_list, print_warnings


def split_language_into_base_and_region(language: str) -> tuple[str, str | None]:
Expand Down Expand Up @@ -72,16 +54,16 @@ def collect_from(directory: Path, root: Path) -> None:


def compare_files(
english_path: str,
translated_path: str,
english_path: Path,
translated_path: Path,
issue_filter: set[str] | None = None,
translated_region_path: str | None = None,
english_region_path: str | None = None,
translated_region_path: Path | None = None,
english_region_path: Path | None = None,
) -> ComparisonResult:
"""Compare English and translated YAML files"""

def load_rules(path: str | None) -> list[RuleInfo]:
if path and Path(path).exists():
def load_rules(path: Path | None) -> list[RuleInfo]:
if path and path.exists():
rules, _ = parse_yaml_file(path)
return rules
return []
Expand Down Expand Up @@ -149,7 +131,6 @@ def merge_rules(base_rules: list[RuleInfo], region_rules: list[RuleInfo]) -> lis
extra_rules=extra_rules,
untranslated_text=untranslated_text,
rule_differences=rule_differences,
file_path=translated_path,
english_rule_count=len(english_rules),
translated_rule_count=len(translated_rules),
)
Expand All @@ -172,24 +153,18 @@ def audit_language(
english_region_dir = english_dir / region if region else None

if not english_dir.exists():
console.print(f"\n[red]✗ Error:[/] English rules directory not found: {english_dir}")
sys.exit(1)
raise AuditError(f"English rules directory not found: {english_dir}")

if not translated_dir.exists():
console.print(f"\n[red]✗ Error:[/] Translation directory not found: {translated_dir}")
sys.exit(1)
raise AuditError(f"Translation directory not found: {translated_dir}")

if region and not (translated_region_dir and translated_region_dir.exists()):
console.print(f"\n[red]✗ Error:[/] Region directory not found: {translated_region_dir}")
sys.exit(1)
raise AuditError(f"Region directory not found: {translated_region_dir}")

# Get list of files to audit
files = [specific_file] if specific_file else get_yaml_files(english_dir, english_region_dir)

# Print header
console.print(Panel(f"MathCAT Translation Audit: {language.upper()}", style="bold cyan"))
console.print("\n [dim]Comparing against English (en) reference files[/]")
console.print(f" [dim]Files to check: {len(files)}[/]")
print_audit_header(language, len(files))

total_issues = 0
total_missing = 0
Expand All @@ -210,11 +185,11 @@ def audit_language(
continue

result = compare_files(
str(english_path),
str(translated_path),
english_path,
translated_path,
issue_filter,
str(translated_region_path) if translated_region_path and translated_region_path.exists() else None,
str(english_region_path) if english_region_path and english_region_path.exists() else None,
translated_region_path if translated_region_path and translated_region_path.exists() else None,
english_region_path if english_region_path and english_region_path.exists() else None,
)

if result.has_issues:
Expand All @@ -226,52 +201,40 @@ def audit_language(
files_ok += 1

total_missing += len(result.missing_rules)
total_untranslated += sum(len(entries) for _, entries in result.untranslated_text)
total_untranslated += sum(len(entries) for _rule, entries in result.untranslated_text)
total_extra += len(result.extra_rules)
total_differences += len(result.rule_differences)

# Summary
table = Table(title="SUMMARY", title_style="bold", box=None, show_header=False, padding=(0, 2))
table.add_column(width=30)
table.add_column()
for label, value, color in [
("Files checked", len(files), None),
("Files with issues", files_with_issues, "yellow" if files_with_issues else "green"),
("Files OK", files_ok, "green" if files_ok else None),
("Missing rules", total_missing, "red" if total_missing else "green"),
("Untranslated text", total_untranslated, "yellow" if total_untranslated else "green"),
("Rule differences", total_differences, "magenta" if total_differences else "green"),
("Extra rules", total_extra, "blue" if total_extra else None),
]:
table.add_row(label, f"[{color}]{value}[/]" if color else str(value))
console.print(Panel(table, style="cyan"))
print_audit_summary(
AuditSummary(
files_checked=len(files),
files_with_issues=files_with_issues,
files_ok=files_ok,
total_missing=total_missing,
total_untranslated=total_untranslated,
total_extra=total_extra,
total_differences=total_differences,
total_issues=total_issues,
)
)

return total_issues


def list_languages(rules_dir: str | None = None) -> None:
"""List available languages for auditing"""
console.print(Panel("Available Languages", style="bold cyan"))

table = Table(show_header=True, header_style="dim")
table.add_column("Language", justify="center", style="cyan")
table.add_column("YAML files", justify="right")

rules_dir_path = get_rules_dir(rules_dir)

languages: list[tuple[str, int]] = []
for lang_dir in sorted(rules_dir_path.iterdir()):
if not lang_dir.is_dir() or lang_dir.name == "en":
continue
base_count = len(get_yaml_files(lang_dir))
color = file_count_color(base_count)
table.add_row(lang_dir.name, f"[{color}]{base_count}[/] files")
languages.append((lang_dir.name, len(get_yaml_files(lang_dir))))

for region_dir in sorted(lang_dir.iterdir()):
if not region_dir.is_dir() or region_dir.name.lower() == "sharedrules":
continue
code = f"{lang_dir.name}-{region_dir.name}"
count = len(get_yaml_files(lang_dir, region_dir))
region_color = file_count_color(count)
table.add_row(code, f"[{region_color}]{count}[/] files")
languages.append((code, len(get_yaml_files(lang_dir, region_dir))))

console.print(table)
console.print("\n [dim]Reference: en (English) - base translation[/]\n")
print_language_list(languages)
27 changes: 15 additions & 12 deletions PythonScripts/audit_translations/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,14 @@
import argparse
import sys

from .auditor import audit_language, console, list_languages
from .auditor import audit_language, list_languages
from .models import AuditError
from .renderer import console


def main() -> None:
"""Main entry point for the audit tool"""
sys.stdout.reconfigure(encoding="utf-8")

parser = argparse.ArgumentParser(
description="Audit MathCAT translation files against English originals",
Expand Down Expand Up @@ -58,14 +61,14 @@ def main() -> None:
sys.exit(1)
issue_filter = set(tokens)

audit_language(
args.language,
args.specific_file,
args.rules_dir,
issue_filter,
args.verbose,
)


if __name__ == "__main__":
main()
try:
audit_language(
args.language,
args.specific_file,
args.rules_dir,
issue_filter,
args.verbose,
)
except AuditError as exc:
console.print(f"\n[red]✗ Error:[/] {exc}")
sys.exit(1)
98 changes: 98 additions & 0 deletions PythonScripts/audit_translations/differ.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
"""
Rule diffing logic.

Compares English and translated rules to find fine-grained structural differences.
"""

from .extractors import (
extract_conditions,
extract_match_pattern,
extract_structure_elements,
extract_variables,
normalize_xpath,
)
from .models import DiffType, RuleDifference, RuleInfo


def dedup_list(values: list[str]) -> list[str]:
"""
Return a list without duplicates while preserving first-seen order.
Originally, rule differences were stored as sets, losing their original order,
which is not helpful and why it changed with the help of this function.

Example:
>>> dedup_list(["if:a", "if:b", "if:a"])
['if:a', 'if:b']
"""
return list(dict.fromkeys(values)) # dict preserves insertion order (guaranteed in Python 3.7+)


def diff_rules(english_rule: RuleInfo, translated_rule: RuleInfo) -> list[RuleDifference]:
"""
Compare two rules and return fine-grained differences.
Ignores text content differences (T/t values) but catches structural changes.
"""
differences: list[RuleDifference] = []

def add_difference(diff_type: DiffType, description: str, english_snippet: str, translated_snippet: str) -> None:
differences.append(
RuleDifference(
english_rule,
translated_rule,
diff_type,
description,
english_snippet,
translated_snippet,
)
)

# Check match pattern differences
en_match_raw = extract_match_pattern(english_rule.data)
tr_match_raw = extract_match_pattern(translated_rule.data)
en_match = normalize_xpath(en_match_raw)
tr_match = normalize_xpath(tr_match_raw)
if en_match != tr_match and en_match and tr_match:
add_difference(DiffType.MATCH, "Match pattern differs", en_match, tr_match)

# Check condition differences
en_conditions_raw = extract_conditions(english_rule.data)
tr_conditions_raw = extract_conditions(translated_rule.data)
en_conditions = [normalize_xpath(c) for c in en_conditions_raw]
tr_conditions = [normalize_xpath(c) for c in tr_conditions_raw]
if en_conditions != tr_conditions:
# Find specific differences
en_set, tr_set = set(en_conditions), set(tr_conditions)
if en_set != tr_set:
add_difference(
DiffType.CONDITION,
"Conditions differ",
", ".join(dedup_list(en_conditions)) or "(none)",
", ".join(dedup_list(tr_conditions)) or "(none)",
)

# Check variable differences
en_vars = extract_variables(english_rule.data)
tr_vars = extract_variables(translated_rule.data)
if en_vars != tr_vars:
en_var_names = {v[0] for v in en_vars}
tr_var_names = {v[0] for v in tr_vars}
if en_var_names != tr_var_names:
add_difference(
DiffType.VARIABLES,
"Variable definitions differ",
", ".join(sorted(en_var_names)) or "(none)",
", ".join(sorted(tr_var_names)) or "(none)",
)

# Check structural differences (test/if/then/else blocks)
en_structure = extract_structure_elements(english_rule.data)
tr_structure = extract_structure_elements(translated_rule.data)
if en_structure != tr_structure:
add_difference(
DiffType.STRUCTURE,
"Rule structure differs (test/if/then/else blocks)",
" ".join(en_structure),
" ".join(tr_structure),
)

return differences
Loading