Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion scripts/analyze_duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from itertools import chain
from pathlib import Path
from typing import Any
import hashlib

from dex_python.deduplication import (
find_birthday_name_duplicates,
Expand Down Expand Up @@ -37,6 +38,20 @@
return {"id": contact_id, "name": "Unknown", "job": "N/A"}


def _pseudonymize(value: str) -> str:
"""Return a pseudonymous representation of a potentially sensitive value.

This avoids storing raw identifiers or PII in clear text while still
allowing consistent comparison within the report.
"""
if value is None:
return "N/A"
text = str(value)
digest = hashlib.sha256(text.encode("utf-8")).hexdigest()[:8]
prefix = text[:3] if len(text) > 3 else text
return f"{prefix}…:{digest}"


def write_group_to_file(
f: Any, conn: sqlite3.Connection, group: dict[str, Any], title: str
) -> None:
Expand All @@ -46,7 +61,10 @@
f.write("|---|---|---|\n")
for cid in group["contact_ids"]:
info = get_contact_summary(conn, cid)
f.write(f"| `{info['id']}` | {info['name']} | {info['job']} |\n")
masked_id = _pseudonymize(info.get("id"))
masked_name = _pseudonymize(info.get("name"))
masked_job = _pseudonymize(info.get("job"))
f.write(f"| `{masked_id}` | {masked_name} | {masked_job} |\n")

Check failure

Code scanning / CodeQL

Clear-text storage of sensitive information High

This expression stores sensitive data (private) as clear text.

Copilot Autofix

AI 4 months ago

In general, to fix clear-text storage of sensitive information in reports/logs, you either (1) avoid including the sensitive fields altogether, or (2) replace them with non-reversible pseudonyms that cannot be used to reconstruct the original values and do not expose any direct substring of the data. Deterministic, secret-key–based tokens are better than raw hashes, and exposing no raw prefix at all is safer than exposing a few characters.

For this specific code, the problem is localized in _pseudonymize and its use in write_group_to_file. We can fix the issue without changing existing functionality in a user-visible way by tightening _pseudonymize so that it no longer includes any portion of the original string, and only outputs a fixed label plus a short, deterministic pseudonymous token derived from the input. This keeps stable equality (the same input always yields the same masked output), so duplicate grouping and report readability (“same masked name appears several times”) are preserved, while eliminating direct leakage of clear-text characters. To further reduce the risk of offline guessing attacks, we can introduce a process-local random salt, so that the digest used for masking cannot be precomputed from the raw database values; since the salt only needs to be consistent within a single run (for generating this report), a single generated salt in the module scope is sufficient and does not require persistent storage.

Concretely, in scripts/analyze_duplicates.py:

  • Add an import for secrets and base64 (both from the standard library) alongside the existing imports.
  • Add a module-level random salt (e.g., PSEUDONYM_SALT = secrets.token_bytes(16)).
  • Rewrite _pseudonymize to:
    • Handle None / empty the same as today ("N/A").
    • Compute digest = hashlib.sha256(PSEUDONYM_SALT + text.encode("utf-8")).digest().
    • Encode a short prefix of the digest with URL-safe base64 (or hex) and truncate it for readability (e.g., 10 chars).
    • Return a generic label like anon:<token> or pseudonym:<token> without embedding any substring of text.
  • Leave write_group_to_file unchanged; it will automatically start writing the safer tokens instead of the current "prefix…:digest" values.

This modification is restricted to the shown file and lines, uses only standard-library modules, preserves the behavior that identical inputs yield identical masked outputs, and removes clear-text leakage that CodeQL flags.


Suggested changeset 1
scripts/analyze_duplicates.py

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/scripts/analyze_duplicates.py b/scripts/analyze_duplicates.py
--- a/scripts/analyze_duplicates.py
+++ b/scripts/analyze_duplicates.py
@@ -6,6 +6,8 @@
 from pathlib import Path
 from typing import Any
 import hashlib
+import secrets
+import base64
 
 from dex_python.deduplication import (
     find_birthday_name_duplicates,
@@ -20,7 +22,13 @@
 DEFAULT_DB_PATH = DATA_DIR / "dex_contacts.db"
 DEFAULT_REPORT_PATH = DATA_DIR / "DUPLICATE_REPORT.md"
 
+# Per-process random salt used to strengthen pseudonymization. This ensures
+# that the masked values in the report cannot be directly linked back to
+# database values via precomputed hashes, while remaining stable within
+# a single run of the script.
+PSEUDONYM_SALT = secrets.token_bytes(16)
 
+
 def get_contact_summary(conn: sqlite3.Connection, contact_id: str) -> dict[str, Any]:
     """Fetch basic info for a contact to display in the report."""
     cursor = conn.cursor()
@@ -43,15 +50,25 @@
 
     This avoids storing raw identifiers or PII in clear text while still
     allowing consistent comparison within the report.
+
+    The function produces a deterministic, non-reversible token for a given
+    input *within a single run* of the script, and does not expose any
+    substring of the original value.
     """
     if value is None:
         return "N/A"
     text = str(value)
-    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()[:8]
-    prefix = text[:3] if len(text) > 3 else text
-    return f"{prefix}…:{digest}"
+    if not text:
+        return "N/A"
 
+    # Combine a per-process random salt with the value so that the resulting
+    # token cannot be matched against precomputed hashes of database values.
+    digest_bytes = hashlib.sha256(PSEUDONYM_SALT + text.encode("utf-8")).digest()
+    # Use a short, URL-safe base64 representation for readability.
+    token = base64.urlsafe_b64encode(digest_bytes).decode("ascii").rstrip("=")[:10]
+    return f"anon:{token}"
 
+
 def write_group_to_file(
     f: Any, conn: sqlite3.Connection, group: dict[str, Any], title: str
 ) -> None:
EOF
@@ -6,6 +6,8 @@
from pathlib import Path
from typing import Any
import hashlib
import secrets
import base64

from dex_python.deduplication import (
find_birthday_name_duplicates,
@@ -20,7 +22,13 @@
DEFAULT_DB_PATH = DATA_DIR / "dex_contacts.db"
DEFAULT_REPORT_PATH = DATA_DIR / "DUPLICATE_REPORT.md"

# Per-process random salt used to strengthen pseudonymization. This ensures
# that the masked values in the report cannot be directly linked back to
# database values via precomputed hashes, while remaining stable within
# a single run of the script.
PSEUDONYM_SALT = secrets.token_bytes(16)


def get_contact_summary(conn: sqlite3.Connection, contact_id: str) -> dict[str, Any]:
"""Fetch basic info for a contact to display in the report."""
cursor = conn.cursor()
@@ -43,15 +50,25 @@

This avoids storing raw identifiers or PII in clear text while still
allowing consistent comparison within the report.

The function produces a deterministic, non-reversible token for a given
input *within a single run* of the script, and does not expose any
substring of the original value.
"""
if value is None:
return "N/A"
text = str(value)
digest = hashlib.sha256(text.encode("utf-8")).hexdigest()[:8]
prefix = text[:3] if len(text) > 3 else text
return f"{prefix}…:{digest}"
if not text:
return "N/A"

# Combine a per-process random salt with the value so that the resulting
# token cannot be matched against precomputed hashes of database values.
digest_bytes = hashlib.sha256(PSEUDONYM_SALT + text.encode("utf-8")).digest()
# Use a short, URL-safe base64 representation for readability.
token = base64.urlsafe_b64encode(digest_bytes).decode("ascii").rstrip("=")[:10]
return f"anon:{token}"


def write_group_to_file(
f: Any, conn: sqlite3.Connection, group: dict[str, Any], title: str
) -> None:
Copilot is powered by AI and may make mistakes. Always verify output.
f.write("\n")


Expand Down