Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 31 additions & 31 deletions scripts/analyze_duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,16 @@ def get_contact_summary(conn: sqlite3.Connection, contact_id: str) -> dict[str,


def write_group_to_file(
f: Any, conn: sqlite3.Connection, group: dict[str, Any], title: str
report_file: Any, conn: sqlite3.Connection, group: dict[str, Any], title: str
) -> None:
"""Helper to write a duplicate group to the report file."""
f.write(f"### {title}: `{group['match_value']}`\n")
f.write("| ID | Name | Job Title |\n")
f.write("|---|---|---|\n")
for cid in group["contact_ids"]:
info = get_contact_summary(conn, cid)
f.write(f"| `{info['id']}` | {info['name']} | {info['job']} |\n")
f.write("\n")
report_file.write(f"### {title}: `{group['match_value']}`\n")
report_file.write("| ID | Name | Job Title |\n")
report_file.write("|---|---|---|\n")
for contact_id in group["contact_ids"]:
info = get_contact_summary(conn, contact_id)
report_file.write(f"| `{info['id']}` | {info['name']} | {info['job']} |\n")
report_file.write("\n")


def generate_report(db_path: str, output_path: str) -> None:
Expand Down Expand Up @@ -88,44 +88,44 @@ def generate_report(db_path: str, output_path: str) -> None:
print(f"Total contacts flagged as potential duplicates: {len(all_dupe_ids)}")

# Generate Markdown Report
with open(output_path, "w") as f:
f.write("# Comprehensive Duplicate Contact Report\n\n")
f.write(f"**Database:** `{db_path}`\n")
f.write(f"**Total Flagged Contacts:** {len(all_dupe_ids)}\n\n")
with open(output_path, "w") as report_file:
report_file.write("# Comprehensive Duplicate Contact Report\n\n")
report_file.write(f"**Database:** `{db_path}`\n")
report_file.write(f"**Total Flagged Contacts:** {len(all_dupe_ids)}\n\n")

f.write("## Level 1: Exact Matches (Highest Confidence)\n")
f.write("### Shared Emails\n")
report_file.write("## Level 1: Exact Matches (Highest Confidence)\n")
report_file.write("### Shared Emails\n")
if not email_dupes:
f.write("_No shared emails found._\n")
report_file.write("_No shared emails found._\n")
for group in email_dupes:
write_group_to_file(f, conn, group, "Email")
write_group_to_file(report_file, conn, group, "Email")

f.write("### Shared Phones\n")
report_file.write("### Shared Phones\n")
if not phone_dupes:
f.write("_No shared phone numbers found._\n")
report_file.write("_No shared phone numbers found._\n")
for group in phone_dupes:
write_group_to_file(f, conn, group, "Phone")
write_group_to_file(report_file, conn, group, "Phone")

f.write("## Level 1.5: Name + Birthday (High Confidence)\n")
f.write("### Same Name and Birthday\n")
report_file.write("## Level 1.5: Name + Birthday (High Confidence)\n")
report_file.write("### Same Name and Birthday\n")
if not birthday_dupes:
f.write("_No name + birthday duplicates found._\n")
report_file.write("_No name + birthday duplicates found._\n")
for group in birthday_dupes:
write_group_to_file(f, conn, group, "Birthday")
write_group_to_file(report_file, conn, group, "Birthday")

f.write("## Level 2: Rule-Based Matches (Medium Confidence)\n")
f.write("### Shared Name + Job Title\n")
report_file.write("## Level 2: Rule-Based Matches (Medium Confidence)\n")
report_file.write("### Shared Name + Job Title\n")
if not name_title_dupes:
f.write("_No Name + Job Title duplicates found._\n")
report_file.write("_No Name + Job Title duplicates found._\n")
for group in name_title_dupes:
write_group_to_file(f, conn, group, "Match")
write_group_to_file(report_file, conn, group, "Match")

f.write("## Level 3: Fuzzy Matches (Lower Confidence)\n")
f.write("### Fuzzy Name Matches (Jaro-Winkler > 0.95)\n")
report_file.write("## Level 3: Fuzzy Matches (Lower Confidence)\n")
report_file.write("### Fuzzy Name Matches (Jaro-Winkler > 0.95)\n")
if not fuzzy_dupes:
f.write("_No fuzzy name duplicates found._\n")
report_file.write("_No fuzzy name duplicates found._\n")
for group in fuzzy_dupes:
write_group_to_file(f, conn, group, "Fuzzy Match")
write_group_to_file(report_file, conn, group, "Fuzzy Match")

conn.close()
print(f"Report generated: {output_path}")
Expand Down
54 changes: 28 additions & 26 deletions src/dex_python/deduplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,40 +209,42 @@ def find_fuzzy_name_duplicates(
rows = cursor.fetchall()

blocks: dict[str, list[dict[str, str]]] = {}
for rid, first, last in rows:
for contact_id, first, last in rows:
first, last = first.strip(), last.strip()

# Skip empty names after stripping
if not first or not last:
continue

try:
key = jellyfish.metaphone(last) or last.lower()[:2]
phonetic_key = jellyfish.metaphone(last) or last.lower()[:2]
except Exception:
key = last.lower()[:2]
phonetic_key = last.lower()[:2]

if key not in blocks:
blocks[key] = []
blocks[key].append({"id": rid, "full_name": f"{first} {last}"})
if phonetic_key not in blocks:
blocks[phonetic_key] = []
blocks[phonetic_key].append({"id": contact_id, "full_name": f"{first} {last}"})

results = []
for items in blocks.values():
if len(items) < 2:
continue
for i in range(len(items)):
for j in range(i + 1, len(items)):
p1, p2 = items[i], items[j]
for first_index in range(len(items)):
for second_index in range(first_index + 1, len(items)):
contact1, contact2 = items[first_index], items[second_index]
score = jellyfish.jaro_winkler_similarity(
p1["full_name"], p2["full_name"]
contact1["full_name"], contact2["full_name"]
)
if score >= threshold:
match_value = (
f"{contact1['full_name']} <-> "
f"{contact2['full_name']} ({score:.2f})"
)
results.append(
{
"match_type": "fuzzy_name",
"match_value": (
f"{p1['full_name']} <-> {p2['full_name']} ({score:.2f})"
),
"contact_ids": [p1["id"], p2["id"]],
"match_value": match_value,
"contact_ids": [contact1["id"], contact2["id"]],
}
)
return results
Expand All @@ -262,11 +264,11 @@ def cluster_duplicates(matches: list[dict[str, Any]]) -> list[list[str]]:
"""
graph: nx.Graph[str] = nx.Graph()
for match in matches:
ids = match["contact_ids"]
for i in range(len(ids)):
for j in range(i + 1, len(ids)):
graph.add_edge(ids[i], ids[j])
return [list(c) for c in nx.connected_components(graph)]
contact_ids = match["contact_ids"]
for first_index in range(len(contact_ids)):
for second_index in range(first_index + 1, len(contact_ids)):
graph.add_edge(contact_ids[first_index], contact_ids[second_index])
return [list(cluster) for cluster in nx.connected_components(graph)]


def merge_cluster(
Expand Down Expand Up @@ -303,12 +305,12 @@ def merge_cluster(

if primary_id:
# Find the row corresponding to primary_id
primary_row_list = [r for r in rows if r[0] == primary_id]
primary_row_list = [row for row in rows if row[0] == primary_id]
if not primary_row_list:
raise ValueError(f"Primary ID {primary_id} not found in contact cluster")
primary_row = primary_row_list[0]
# Remove primary from candidates to merge FROM
other_rows = [r for r in rows if r[0] != primary_id]
other_rows = [row for row in rows if row[0] != primary_id]
sorted_rows = [primary_row] + other_rows
else:
# Auto-select best primary
Expand All @@ -321,11 +323,11 @@ def score_row(row: tuple[Any, ...]) -> int:

current_primary = list(primary_row)
for other_row in sorted_rows[1:]:
for i in range(len(current_primary)):
if (current_primary[i] is None or current_primary[i] == "") and other_row[
i
]:
current_primary[i] = other_row[i]
for field_index in range(len(current_primary)):
current_field = current_primary[field_index]
is_empty = current_field is None or current_field == ""
if is_empty and other_row[field_index]:
current_primary[field_index] = other_row[field_index]

cursor.execute(
"""
Expand Down
6 changes: 3 additions & 3 deletions src/dex_python/sync_back.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def sync_as_notes(
stats = {"created": 0, "skipped": 0, "errors": 0}
total = len(contacts)

for i, contact in enumerate(contacts):
for current_index, contact in enumerate(contacts):
note_text = build_enrichment_note(contact["company"], contact["role"])
if not note_text:
stats["skipped"] += 1
Expand All @@ -192,8 +192,8 @@ def sync_as_notes(
except Exception:
stats["errors"] += 1

if progress_callback and (i + 1) % 100 == 0:
progress_callback(i + 1, total, stats)
if progress_callback and (current_index + 1) % 100 == 0:
progress_callback(current_index + 1, total, stats)

return stats

Expand Down