Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion scripts/analyze_duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from itertools import chain
from pathlib import Path
from typing import Any
import hashlib

from dex_python.deduplication import (
find_birthday_name_duplicates,
Expand Down Expand Up @@ -37,6 +38,20 @@
return {"id": contact_id, "name": "Unknown", "job": "N/A"}


def _pseudonymize(value: str) -> str:
"""Return a pseudonymous representation of a potentially sensitive value.

This avoids storing raw identifiers or PII in clear text while still
allowing consistent comparison within the report.
"""
if value is None:
return "N/A"
text = str(value)
digest = hashlib.sha256(text.encode("utf-8")).hexdigest()[:8]
prefix = text[:3] if len(text) > 3 else text
return f"{prefix}…:{digest}"


def write_group_to_file(
f: Any, conn: sqlite3.Connection, group: dict[str, Any], title: str
) -> None:
Expand All @@ -46,7 +61,10 @@
f.write("|---|---|---|\n")
for cid in group["contact_ids"]:
info = get_contact_summary(conn, cid)
f.write(f"| `{info['id']}` | {info['name']} | {info['job']} |\n")
masked_id = _pseudonymize(info.get("id"))
masked_name = _pseudonymize(info.get("name"))
masked_job = _pseudonymize(info.get("job"))
f.write(f"| `{masked_id}` | {masked_name} | {masked_job} |\n")

Check failure

Code scanning / CodeQL

Clear-text storage of sensitive information High

This expression stores sensitive data (private) as clear text.

Copilot Autofix

AI 4 months ago

In general, to fix clear-text storage of sensitive information in reports/logs, you either (1) avoid including the sensitive fields altogether, or (2) replace them with non-reversible pseudonyms that cannot be used to reconstruct the original values and do not expose any direct substring of the data. Deterministic, secret-key–based tokens are better than raw hashes, and exposing no raw prefix at all is safer than exposing a few characters.

For this specific code, the problem is localized in _pseudonymize and its use in write_group_to_file. We can fix the issue without changing existing functionality in a user-visible way by tightening _pseudonymize so that it no longer includes any portion of the original string, and only outputs a fixed label plus a short, deterministic pseudonymous token derived from the input. This keeps stable equality (the same input always yields the same masked output), so duplicate grouping and report readability (“same masked name appears several times”) are preserved, while eliminating direct leakage of clear-text characters. To further reduce the risk of offline guessing attacks, we can introduce a process-local random salt, so that the digest used for masking cannot be precomputed from the raw database values; since the salt only needs to be consistent within a single run (for generating this report), a single generated salt in the module scope is sufficient and does not require persistent storage.

Concretely, in scripts/analyze_duplicates.py:

  • Add an import for secrets and base64 (both from the standard library) alongside the existing imports.
  • Add a module-level random salt (e.g., PSEUDONYM_SALT = secrets.token_bytes(16)).
  • Rewrite _pseudonymize to:
    • Handle None / empty the same as today ("N/A").
    • Compute digest = hashlib.sha256(PSEUDONYM_SALT + text.encode("utf-8")).digest().
    • Encode a short prefix of the digest with URL-safe base64 (or hex) and truncate it for readability (e.g., 10 chars).
    • Return a generic label like anon:<token> or pseudonym:<token> without embedding any substring of text.
  • Leave write_group_to_file unchanged; it will automatically start writing the safer tokens instead of the current "prefix…:digest" values.

This modification is restricted to the shown file and lines, uses only standard-library modules, preserves the behavior that identical inputs yield identical masked outputs, and removes clear-text leakage that CodeQL flags.


Suggested changeset 1
scripts/analyze_duplicates.py

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/scripts/analyze_duplicates.py b/scripts/analyze_duplicates.py
--- a/scripts/analyze_duplicates.py
+++ b/scripts/analyze_duplicates.py
@@ -6,6 +6,8 @@
 from pathlib import Path
 from typing import Any
 import hashlib
+import secrets
+import base64
 
 from dex_python.deduplication import (
     find_birthday_name_duplicates,
@@ -20,7 +22,13 @@
 DEFAULT_DB_PATH = DATA_DIR / "dex_contacts.db"
 DEFAULT_REPORT_PATH = DATA_DIR / "DUPLICATE_REPORT.md"
 
+# Per-process random salt used to strengthen pseudonymization. This ensures
+# that the masked values in the report cannot be directly linked back to
+# database values via precomputed hashes, while remaining stable within
+# a single run of the script.
+PSEUDONYM_SALT = secrets.token_bytes(16)
 
+
 def get_contact_summary(conn: sqlite3.Connection, contact_id: str) -> dict[str, Any]:
     """Fetch basic info for a contact to display in the report."""
     cursor = conn.cursor()
@@ -43,15 +50,25 @@
 
     This avoids storing raw identifiers or PII in clear text while still
     allowing consistent comparison within the report.
+
+    The function produces a deterministic, non-reversible token for a given
+    input *within a single run* of the script, and does not expose any
+    substring of the original value.
     """
     if value is None:
         return "N/A"
     text = str(value)
-    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()[:8]
-    prefix = text[:3] if len(text) > 3 else text
-    return f"{prefix}…:{digest}"
+    if not text:
+        return "N/A"
 
+    # Combine a per-process random salt with the value so that the resulting
+    # token cannot be matched against precomputed hashes of database values.
+    digest_bytes = hashlib.sha256(PSEUDONYM_SALT + text.encode("utf-8")).digest()
+    # Use a short, URL-safe base64 representation for readability.
+    token = base64.urlsafe_b64encode(digest_bytes).decode("ascii").rstrip("=")[:10]
+    return f"anon:{token}"
 
+
 def write_group_to_file(
     f: Any, conn: sqlite3.Connection, group: dict[str, Any], title: str
 ) -> None:
EOF
@@ -6,6 +6,8 @@
from pathlib import Path
from typing import Any
import hashlib
import secrets
import base64

from dex_python.deduplication import (
find_birthday_name_duplicates,
@@ -20,7 +22,13 @@
DEFAULT_DB_PATH = DATA_DIR / "dex_contacts.db"
DEFAULT_REPORT_PATH = DATA_DIR / "DUPLICATE_REPORT.md"

# Per-process random salt used to strengthen pseudonymization. This ensures
# that the masked values in the report cannot be directly linked back to
# database values via precomputed hashes, while remaining stable within
# a single run of the script.
PSEUDONYM_SALT = secrets.token_bytes(16)


def get_contact_summary(conn: sqlite3.Connection, contact_id: str) -> dict[str, Any]:
"""Fetch basic info for a contact to display in the report."""
cursor = conn.cursor()
@@ -43,15 +50,25 @@

This avoids storing raw identifiers or PII in clear text while still
allowing consistent comparison within the report.

The function produces a deterministic, non-reversible token for a given
input *within a single run* of the script, and does not expose any
substring of the original value.
"""
if value is None:
return "N/A"
text = str(value)
digest = hashlib.sha256(text.encode("utf-8")).hexdigest()[:8]
prefix = text[:3] if len(text) > 3 else text
return f"{prefix}…:{digest}"
if not text:
return "N/A"

# Combine a per-process random salt with the value so that the resulting
# token cannot be matched against precomputed hashes of database values.
digest_bytes = hashlib.sha256(PSEUDONYM_SALT + text.encode("utf-8")).digest()
# Use a short, URL-safe base64 representation for readability.
token = base64.urlsafe_b64encode(digest_bytes).decode("ascii").rstrip("=")[:10]
return f"anon:{token}"


def write_group_to_file(
f: Any, conn: sqlite3.Connection, group: dict[str, Any], title: str
) -> None:
Copilot is powered by AI and may make mistakes. Always verify output.
f.write("\n")


Expand Down